In [1]:
# Import dependencies
import pandas as pd
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas diagnostics (e.g. SettingWithCopyWarning)
# raised by later cells - consider narrowing it to the specific category that was noisy.
warnings.filterwarnings('ignore')

In [2]:
# Path (relative to the notebook) of the CSV holding the presidential approval polls
csv = 'Resources/approval_polllist.csv'

# Parse the file into a DataFrame using pandas' default reader settings
df = pd.read_csv(filepath_or_buffer=csv)

# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,president,subgroup,modeldate,startdate,enddate,pollster,grade,samplesize,population,weight,...,disapprove,adjusted_approve,adjusted_disapprove,multiversions,tracking,url,poll_id,question_id,createddate,timestamp
0,Donald Trump,All polls,8/31/2020,1/20/2017,1/22/2017,Gallup,B,1500.0,a,0.262323,...,45.0,45.765945,43.590621,,T,http://www.gallup.com/poll/201617/gallup-daily...,49253,77265,1/23/2017,8/31/2020 18:23
1,Donald Trump,All polls,8/31/2020,1/20/2017,1/22/2017,Morning Consult,B/C,1992.0,rv,0.680029,...,37.0,45.285652,37.781866,,,http://static.politico.com/9b/13/82a3baf542ae9...,49249,77261,1/23/2017,8/31/2020 18:23
2,Donald Trump,All polls,8/31/2020,1/20/2017,1/24/2017,Ipsos,B-,1632.0,a,0.153481,...,45.2,43.199569,43.853862,,T,http://polling.reuters.com/#poll/CP3_2/,49426,77599,3/1/2017,8/31/2020 18:23
3,Donald Trump,All polls,8/31/2020,1/21/2017,1/23/2017,Gallup,B,1500.0,a,0.242845,...,46.0,45.765945,44.590621,,T,http://www.gallup.com/poll/201617/gallup-daily...,49262,77274,1/24/2017,8/31/2020 18:23
4,Donald Trump,All polls,8/31/2020,1/22/2017,1/24/2017,Rasmussen Reports/Pulse Opinion Research,C+,1500.0,lv,0.200411,...,43.0,51.601406,44.437696,,T,http://www.rasmussenreports.com/public_content...,49266,77278,1/25/2017,8/31/2020 18:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14542,Donald Trump,Voters,8/31/2020,8/27/2020,8/28/2020,YouGov,B,807.0,rv,0.182970,...,54.0,42.362936,53.353449,,,https://docs.cdn.yougov.com/trcdohan8j/2020082...,68246,127845,8/29/2020,8/31/2020 18:27
14543,Donald Trump,Voters,8/31/2020,8/26/2020,8/30/2020,Rasmussen Reports/Pulse Opinion Research,C+,1500.0,lv,0.507394,...,51.0,42.803609,51.794244,,T,http://www.rasmussenreports.com/public_content...,68257,127866,8/31/2020,8/31/2020 18:27
14544,Donald Trump,Voters,8/31/2020,8/27/2020,8/29/2020,YouGov,B,789.0,rv,0.178889,...,53.0,45.362936,52.353449,,,https://today.yougov.com/_pubapis/v5/us/tracke...,68260,127872,8/31/2020,8/31/2020 18:27
14545,Donald Trump,Voters,8/31/2020,8/28/2020,8/30/2020,YouGov,B,764.0,rv,0.189977,...,53.0,44.362936,52.353449,,,https://today.yougov.com/_pubapis/v5/us/tracke...,68272,127889,8/31/2020,8/31/2020 18:27


In [3]:
# Source column -> output column. The key order is the column order of the other
# polling dataframe, so this one mapping drives both the selection and the rename.
poll_column_names = {
    "pollster": "Poll",
    "startdate": "Start_Date",
    "enddate": "End_Date",
    "approve": "Approve",
    "disapprove": "Disapprove",
    "grade": "Grade",
    "samplesize": "Sample",
    "population": "Population",
    "weight": "Weight",
}

# Keep only the mapped columns (in mapping order), then give them the other dataset's names
presidential_approval = df[list(poll_column_names)].rename(columns=poll_column_names)

In [4]:
# Apply uppercase to the 'Population' column to be consistent with the formatting of the
# other polling dataset, then drop every row that has a null in any remaining column.
# The name is rebound to the chained result instead of using dropna(inplace=True), so the
# cell never mutates a frame in place and gives the same result each time it is re-run.
presidential_approval = (
    presidential_approval
    .assign(Population=lambda polls: polls['Population'].str.upper())
    .dropna()
)

# Keep only the first row seen for each 'Start_Date' value.
# .copy() makes `pres` an independent frame, so assigning columns to it later does not
# raise pandas' SettingWithCopyWarning.
pres = presidential_approval.drop_duplicates(subset=['Start_Date']).copy()

In [5]:
# Display the dtype of every column; the two date columns are still plain strings
# (object) at this point, before they are converted to datetime
pres.dtypes

Poll           object
Start_Date     object
End_Date       object
Approve       float64
Disapprove    float64
Grade          object
Sample        float64
Population     object
Weight        float64
dtype: object

In [6]:
# Alter 'Start_Date' & 'End_Date' columns to be in datetime format, again for consistent formatting.
# .assign() returns a new frame instead of writing columns into `pres`, so this works whether or
# not `pres` is a slice of another frame and never emits SettingWithCopyWarning.
pres = pres.assign(
    Start_Date=pd.to_datetime(pres['Start_Date'].str.strip(), format='%m/%d/%Y'),
    End_Date=pd.to_datetime(pres['End_Date'].str.strip(), format='%m/%d/%Y'),
)

# Duplicates were removed while 'Start_Date' was still a raw string, so values that differed
# only by surrounding whitespace or zero padding now parse to the same date. Keep the first
# row per parsed date so 'Start_Date' stays unique.
pres = pres.drop_duplicates(subset=['Start_Date'])

In [7]:
# Set the index to Start_Date to serve as a primary key in the SQL database.
# verify_integrity=True makes pandas raise a ValueError here if any date is duplicated,
# instead of letting a non-unique key reach the exported CSV.
pres_approval = pres.set_index('Start_Date', verify_integrity=True)

# Reorder the columns to match the column order of the other polling dataframe
pres_approval = pres_approval[['End_Date', 'Poll', 'Approve', 'Disapprove', 'Grade', 'Sample', 'Population', 'Weight']]

# View the cleaned-up dataframe
pres_approval

Unnamed: 0_level_0,End_Date,Poll,Approve,Disapprove,Grade,Sample,Population,Weight
Start_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-20,2017-01-22,Gallup,45.00,45.0,B,1500.0,A,0.262323
2017-01-21,2017-01-23,Gallup,45.00,46.0,B,1500.0,A,0.242845
2017-01-22,2017-01-24,Rasmussen Reports/Pulse Opinion Research,57.00,43.0,C+,1500.0,LV,0.200411
2017-01-23,2017-01-24,Public Policy Polling,44.00,44.0,B,1043.0,RV,1.116294
2017-01-24,2017-01-26,Rasmussen Reports/Pulse Opinion Research,55.00,45.0,C+,1500.0,LV,0.173855
...,...,...,...,...,...,...,...,...
2020-08-25,2020-08-27,YouGov,45.00,51.0,B,1000.0,A,0.156462
2020-08-26,2020-08-28,YouGov,41.00,54.0,B,1000.0,A,0.167910
2020-08-27,2020-08-28,YouGov,40.00,55.0,B,1001.0,A,0.181665
2020-08-28,2020-08-30,YouGov,42.00,52.0,B,1000.0,A,0.199854


In [8]:
# Write the cleaned frame (Start_Date index included) to the CSV that the SQL database imports
pres_approval.to_csv(path_or_buf='Output_data/presidential_approval.csv')