In [1]:
# Import necessary libraries
import pandas as pd
from pathlib import Path

In [2]:
# Locations of the four cleaned input CSVs (relative to the working directory)
col_path, housing_path, lifestyle_path, crime_path = map(
    Path,
    (
        'Clean_Data/clean_col_data.csv',
        'Clean_Data/clean_housing_data_city.csv',
        'Clean_Data/clean_lifestyle_data.csv',
        'Clean_Data/clean_crime_data.csv',
    ),
)

In [3]:
# Load each cleaned CSV into its own DataFrame (same order as the paths above)
col_data, housing_data, lifestyle_data, crime_data = (
    pd.read_csv(csv_path)
    for csv_path in (col_path, housing_path, lifestyle_path, crime_path)
)

In [4]:
# Preview the first five rows of the cost-of-living table
col_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,Cost of Living Index
0,"Austin, TX",106.6
1,"Boston, MA",132.6
2,"Charlotte, NC",97.9
3,"Chicago, IL",100.1
4,"Columbus, OH",93.9


In [5]:
# Preview the first five rows of the housing table
housing_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,1-Unit,2-unit,3-unit,4-unit,studio rent,1 bdrm rent,2 bdrm rent,3 bdrm rent,4 bdrm rent
0,"Austin, TX",276400.0,353400.0,428200.0,531000.0,1059,1212,1434,1848,2207
1,"Boston, MA",456400.0,583800.0,707000.0,875600.0,1742,1924,2336,2906,3168
2,"Charlotte, NC",234750.0,300000.0,363750.0,450500.0,987,1010,1151,1518,1956
3,"Chicago, IL",249166.67,318666.67,386166.67,478333.33,1012,1122,1299,1649,1969
4,"Columbus, OH",209714.29,268285.71,324571.43,402142.86,717,827,1031,1298,1468


In [6]:
# Preview the first five rows of the lifestyle table
lifestyle_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,Apparel and Services,Men's Apparel,Women's Apparel,Children's Apparel,Footwear,Watches & Jewelry,Apparel Products and Services (1),Computers and Hardware for Home Use,Portable Memory,...,Vehicle Insurance,Life/Other Insurance,Health Insurance,Personal Care Products (18),School Books and Supplies (19),Smoking Products,Gasoline and Motor Oil,Airline Fares,Lodging on Trips,Auto/Truck Rental on Trips
0,"New York, NY",133,136,125,131,141,139,160,126,130,...,114,110,111,124,126,114,111,130,114,127
1,"Los Angeles, CA",115,119,113,115,118,108,125,122,118,...,105,99,100,111,117,93,110,123,108,119
2,"Chicago, IL",117,119,114,116,119,111,124,120,117,...,108,104,105,113,116,98,111,122,111,119
3,"Houston, TX",110,111,107,118,110,104,103,111,108,...,113,102,105,110,113,102,111,108,105,110
4,"Phoenix, AZ",103,105,101,107,102,97,98,105,102,...,106,97,101,104,106,96,104,103,100,105


In [7]:
# Preview the first five rows of the crime table
crime_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,Crime Index
0,"New York, NY",47.35
1,"Los Angeles, CA",48.89
2,"Chicago, IL",65.17
3,"Houston, TX",63.95
4,"Phoenix, AZ",51.93


In [8]:
# Order the lifestyle table by city name (ascending), as is done for the crime
# table in the next cell, then preview the result
lifestyle_data = lifestyle_data.sort_values(
    by='Metropolitan/FMR Area Name',
    ascending=True,
)
lifestyle_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,Apparel and Services,Men's Apparel,Women's Apparel,Children's Apparel,Footwear,Watches & Jewelry,Apparel Products and Services (1),Computers and Hardware for Home Use,Portable Memory,...,Vehicle Insurance,Life/Other Insurance,Health Insurance,Personal Care Products (18),School Books and Supplies (19),Smoking Products,Gasoline and Motor Oil,Airline Fares,Lodging on Trips,Auto/Truck Rental on Trips
10,"Austin, TX",122,124,119,127,122,117,114,124,120,...,123,108,112,121,127,116,121,119,114,121
23,"Boston, MA",144,146,142,138,147,143,160,145,141,...,130,134,130,138,139,121,130,152,142,148
15,"Charlotte, NC",109,109,108,114,109,106,103,109,108,...,111,105,106,109,111,107,109,106,106,108
2,"Chicago, IL",117,119,114,116,119,111,124,120,117,...,108,104,105,113,116,98,111,122,111,119
13,"Columbus, OH",102,102,102,104,103,101,98,101,102,...,102,98,100,102,103,105,101,98,99,99


In [9]:
# Order the crime table by city name (ascending) and preview the result
crime_data = crime_data.sort_values(
    by='Metropolitan/FMR Area Name',
    ascending=True,
)
crime_data.head(n=5)

Unnamed: 0,Metropolitan/FMR Area Name,Crime Index
10,"Austin, TX",37.19
23,"Boston, MA",34.79
15,"Charlotte, NC",44.82
2,"Chicago, IL",65.17
13,"Columbus, OH",43.79


In [10]:
# Re-index all four tables by city name so the joins below can align on it.
# NOTE: re-running this cell on its own fails, because the column has already
# been moved into the index; re-run from the CSV-loading cell instead.
housing_data, col_data, crime_data, lifestyle_data = (
    table.set_index('Metropolitan/FMR Area Name')
    for table in (housing_data, col_data, crime_data, lifestyle_data)
)

In [18]:
# Combine the tables pairwise on the city-name index level.
# Inner joins keep only the cities present in both tables of each pair.
housing_col_data = housing_data.merge(
    col_data,
    how='inner',
    on='Metropolitan/FMR Area Name',
)
crime_lifestyle_data = crime_data.merge(
    lifestyle_data,
    how='inner',
    on='Metropolitan/FMR Area Name',
)

In [20]:
# Join the two intermediate tables into the final per-city table (inner join on
# the city-name index level) and display it
city_data = housing_col_data.merge(
    crime_lifestyle_data,
    how='inner',
    on='Metropolitan/FMR Area Name',
)
city_data

Unnamed: 0_level_0,1-Unit,2-unit,3-unit,4-unit,studio rent,1 bdrm rent,2 bdrm rent,3 bdrm rent,4 bdrm rent,Cost of Living Index,...,Vehicle Insurance,Life/Other Insurance,Health Insurance,Personal Care Products (18),School Books and Supplies (19),Smoking Products,Gasoline and Motor Oil,Airline Fares,Lodging on Trips,Auto/Truck Rental on Trips
Metropolitan/FMR Area Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin, TX",276400.0,353400.0,428200.0,531000.0,1059,1212,1434,1848,2207,106.6,...,123,108,112,121,127,116,121,119,114,121
"Boston, MA",456400.0,583800.0,707000.0,875600.0,1742,1924,2336,2906,3168,132.6,...,130,134,130,138,139,121,130,152,142,148
"Charlotte, NC",234750.0,300000.0,363750.0,450500.0,987,1010,1151,1518,1956,97.9,...,111,105,106,109,111,107,109,106,106,108
"Chicago, IL",249166.67,318666.67,386166.67,478333.33,1012,1122,1299,1649,1969,100.1,...,108,104,105,113,116,98,111,122,111,119
"Columbus, OH",209714.29,268285.71,324571.43,402142.86,717,827,1031,1298,1468,93.9,...,102,98,100,102,103,105,101,98,99,99
"Dallas, TX",254857.14,326285.71,394857.14,489142.86,1029,1134,1352,1746,2309,98.5,...,116,106,108,114,118,106,115,113,110,115
"Denver, CO",399000.0,510800.0,619000.0,766400.0,1179,1304,1605,2186,2486,112.1,...,124,116,117,124,127,114,122,126,122,127
"El Paso, TX",162000.0,207000.0,250000.0,310000.0,548,687,831,1174,1415,88.6,...,76,64,69,72,72,66,74,67,64,67
"Fort Worth, TX",233666.67,299000.0,361666.67,448333.33,901,1021,1242,1661,2077,100.2,...,107,100,101,105,107,101,106,101,100,103
"Houston, TX",217857.14,278571.43,337714.29,418142.86,908,983,1176,1576,2010,95.8,...,113,102,105,110,113,102,111,108,105,110


In [21]:
# Save file as CSV
# The city name ('Metropolitan/FMR Area Name') was moved into the index before
# the merges, so the index must be written out. With index=False the saved file
# would contain no column identifying which city each row belongs to.
city_data.to_csv('Clean_Data/city_data.csv', index=True)