In [1]:
# Dependencies
import pandas as pd
import sqlite3

In [8]:
# Open the SQLite database that holds the hotel metadata and ratings tables
conn = sqlite3.connect("Data/Hotels.db")

# Load one row per rating (hotel name, province, rating) into a pandas dataframe.
# The try/finally guarantees the connection is closed even when the query
# raises (e.g. a missing table), instead of leaking the open handle.
# NOTE(review): the join key is the hotel name only; if two hotels in the
# metadata table share a name, their ratings would be cross-matched — confirm
# that names are unique in `metadata`.
try:
    popndf = pd.read_sql_query("select m.name, m.province, r.reviews_rating from metadata as m join ratings as r on m.name = r.Name;", conn)
finally:
    conn.close()

# Preview the dataframe
popndf.head()

Unnamed: 0,name,province,reviews_rating
0,Rancho Valencia Resort Spa,CA,5.0
1,Rancho Valencia Resort Spa,CA,5.0
2,Rancho Valencia Resort Spa,CA,5.0
3,Aloft Arundel Mills,MD,2.0
4,Aloft Arundel Mills,MD,5.0


In [18]:
# Create a dictionary of state names and their abbreviations.
# Every abbreviation is paired with its name explicitly, so the mapping does
# not depend on which province codes happen to be present in popndf or on the
# order of a separately maintained list of names.
state_dict = {
    "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", "AZ": "Arizona",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut",
    "DC": "District of Columbia", "DE": "Delaware", "FL": "Florida",
    "GA": "Georgia", "HI": "Hawaii", "IA": "Iowa", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "KS": "Kansas", "KY": "Kentucky",
    "LA": "Louisiana", "MA": "Massachusetts", "MD": "Maryland", "ME": "Maine",
    "MI": "Michigan", "MN": "Minnesota", "MO": "Missouri", "MS": "Mississippi",
    "MT": "Montana", "NC": "North Carolina", "ND": "North Dakota",
    "NE": "Nebraska", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NV": "Nevada", "NY": "New York", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania",
    "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota",
    "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VA": "Virginia",
    "VT": "Vermont", "WA": "Washington", "WI": "Wisconsin",
    "WV": "West Virginia", "WY": "Wyoming",
}

# Province codes that actually occur in the data, and their matching names
# (kept as separate lists for any later cell that reads them).
state_abbr = sorted(set(popndf["province"]))
state_name = [state_dict[abbr] for abbr in state_abbr if abbr in state_dict]

# Surface codes that have no known state name; .map() leaves those rows as NaN.
unmapped_abbr = [abbr for abbr in state_abbr if abbr not in state_dict]
if unmapped_abbr:
    print("Province codes without a state name:", unmapped_abbr)

# Add a column containing state names
popndf["State"] = popndf["province"].map(state_dict)
popndf.head()

Unnamed: 0,name,province,reviews_rating,State
0,Rancho Valencia Resort Spa,CA,5.0,California
1,Rancho Valencia Resort Spa,CA,5.0,California
2,Rancho Valencia Resort Spa,CA,5.0,California
3,Aloft Arundel Mills,MD,2.0,Maryland
4,Aloft Arundel Mills,MD,5.0,Maryland


In [9]:
# Read the state population CSV into a dataframe
path = "Data/State_population_data.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Show the first rows as a quick sanity check
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,POPESTIMATE2018,POPEST18PLUS2018,PCNT_POPEST18PLUS
0,10,0,0,0,United States,327167434,253768092,77.6
1,40,3,6,1,Alabama,4887871,3798031,77.7
2,40,4,9,2,Alaska,737438,553622,75.1
3,40,4,8,4,Arizona,7171646,5528989,77.1
4,40,3,7,5,Arkansas,3013825,2310645,76.7


In [13]:
# Keep just the two columns the analysis needs: NAME and POPEST18PLUS2018
columns_needed = ['NAME', 'POPEST18PLUS2018']
df1 = df[columns_needed]

# Quick look at the trimmed dataframe
df1.head()

Unnamed: 0,NAME,POPEST18PLUS2018
0,United States,253768092
1,Alabama,3798031
2,Alaska,553622
3,Arizona,5528989
4,Arkansas,2310645


In [19]:
# Give the two columns shorter names: NAME -> State, POPEST18PLUS2018 -> Popn
df2 = df1[['NAME', 'POPEST18PLUS2018']].rename(
    columns={'NAME': 'State', 'POPEST18PLUS2018': 'Popn'}
)
df2.head()

Unnamed: 0,State,Popn
0,United States,253768092
1,Alabama,3798031
2,Alaska,553622
3,Arizona,5528989
4,Arkansas,2310645


In [21]:
# Attach the population figure from df2 to every review row in popndf.
# The join key and join type are spelled out so the result cannot change if
# either frame later gains another column with a shared name. With an inner
# join, rows whose State has no match in df2 (including rows where State is
# NaN) are dropped.
df_merge_col = pd.merge(popndf, df2, on="State", how="inner")

# Preview only the first rows rather than dumping the whole merged frame
df_merge_col.head(10)

Unnamed: 0,name,province,reviews_rating,State,Popn
0,Rancho Valencia Resort Spa,CA,5.00,California,30567090
1,Rancho Valencia Resort Spa,CA,5.00,California,30567090
2,Rancho Valencia Resort Spa,CA,5.00,California,30567090
3,Hotel Zelos,CA,3.00,California,30567090
4,Hotel Zelos,CA,4.00,California,30567090
5,Hotel Zelos,CA,5.00,California,30567090
6,Hotel Zelos,CA,5.00,California,30567090
7,Fairmont Grand Del Mar,CA,4.00,California,30567090
8,Fairmont Grand Del Mar,CA,4.00,California,30567090
9,Fairmont Grand Del Mar,CA,5.00,California,30567090


In [25]:
# Count the non-null entries of each remaining column per (Popn, State) pair
(
    df_merge_col
    .groupby(by=['Popn', 'State'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,province,reviews_rating
Popn,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
442962,Wyoming,114,114,114
510326,Vermont,14,14,14
553622,Alaska,21,21,21
581379,North Dakota,18,18,18
664629,South Dakota,121,121,121
763555,Delaware,56,56,56
832871,Montana,144,144,144
852102,Rhode Island,5,5,5
1088000,Maine,138,138,138
1098288,New Hampshire,123,123,123
