### Joining Data From Previous Notebooks
(the two files containing demographic & soccer activity/rent data)

---

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [2]:
# Per-city demographic table; this file only covers LA & OC cities.
demographics_LA_OC = pd.read_csv(filepath_or_buffer='demographics_LA_OC.csv')

In [3]:
# Rent / soccer-activity table for every city on the map, including cities outside LA & OC.
all_rent_activity_data = pd.read_csv(filepath_or_buffer='all_rent_activity_data.csv')

In [4]:
# Show the last row to check the column layout and the final index label.
demographics_LA_OC.iloc[-1:]

Unnamed: 0,City,Median_Age,Median_Income,Normalized_Income,Male_Percent,Hispanic_Percent
112,Woodland Hills,37.0,27200.0,0.41,0.5,0.31


In [5]:
# Show the last row to check the column layout and the final index label.
all_rent_activity_data.iloc[-1:]

Unnamed: 0,City,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Soccer_Activity
196,Jurupa Valley,0.0,0.0,0.0,0.0


- Add rows w/ cities from all_rent_activity_data that aren't in demographics_LA_OC
- Fill city col w/ cities not in demographics_LA_OC
- Fill empty cells w/ median col values (medians taken from the LA & OC demographics)
- Merge w/ all_rent_activity_data (to include rent/activity data)

In [6]:
# List demographic cities whose name does not appear in the rent/activity table.
rent_city_names = all_rent_activity_data.City.values
unmatched_cities = [name for name in demographics_LA_OC.City if name not in rent_city_names]
for name in unmatched_cities:
    print(name)

Palos Verdes
Woodland Hills


In [7]:
# Remove the two demographic rows whose city name has no match in the rent/activity table.
# NOTE(review): 'Palos Verdes' may be the same place as 'Palos Verdes Estates' in the
# rent/activity data — confirm whether it should be renamed instead of dropped.
cities_without_rent_data = ['Palos Verdes', 'Woodland Hills']
keep_mask = ~demographics_LA_OC['City'].isin(cities_without_rent_data)
demographics_LA_OC = demographics_LA_OC.loc[keep_mask]

In [8]:
# Number of demographic rows left after the drop above.
len(demographics_LA_OC)

111

In [9]:
# Cities that appear in the rent/activity table but not in the demographics table.
# One entry is produced per rent/activity row, so a repeated city name stays repeated.
known_cities = demographics_LA_OC.City.values
missing = [name for name in all_rent_activity_data.City if name not in known_cities]

In [10]:
# Empty placeholder frame: one row per missing city, same columns as the demographics table.
missing_cities_df = pd.DataFrame(
    columns=demographics_LA_OC.columns,
    index=pd.RangeIndex(len(missing)),
)

In [11]:
# Put the missing city names into the placeholder frame's City column.
missing_cities_df = missing_cities_df.assign(City=missing)

In [12]:
# First placeholder row: City is filled, the remaining columns are still empty.
missing_cities_df.iloc[:1]

Unnamed: 0,City,Median_Age,Median_Income,Normalized_Income,Male_Percent,Hispanic_Percent
0,Palos Verdes Estates,,,,,


In [13]:
# Last five rent/activity rows, for comparison with the combined table built below.
all_rent_activity_data.iloc[-5:]

Unnamed: 0,City,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Soccer_Activity
192,Unincorporated,0.0,0.0,0.0,0.0
193,Palm Springs,0.0,0.0,0.0,0.0
194,Unincorporated,0.0,0.0,0.0,0.0
195,Banning,0.0,0.0,0.0,0.0
196,Jurupa Valley,0.0,0.0,0.0,0.0


In [14]:
# Impute every non-key column of the placeholder rows with the median of that
# column over the cities that do have demographic data.
# Columns are picked by name (everything except 'City') rather than by position,
# so the City key is never overwritten if the column order changes upstream;
# Index.drop raises KeyError if 'City' is absent.
value_cols = missing_cities_df.columns.drop('City')
for col in value_cols:
    missing_cities_df[col] = demographics_LA_OC[col].median()

In [15]:
# First placeholder row again, now with the imputed medians filled in.
missing_cities_df.iloc[:1]

Unnamed: 0,City,Median_Age,Median_Income,Normalized_Income,Male_Percent,Hispanic_Percent
0,Palos Verdes Estates,39.0,23900.0,0.33,0.49,0.32


In [16]:
# Stack the real demographic rows on top of the imputed rows and renumber the index from 0.
combined = pd.concat(
    [demographics_LA_OC, missing_cities_df],
    ignore_index=True,
)

In [17]:
# Last five rows of the stacked table (these come from the imputed placeholder rows).
combined.iloc[-5:]

Unnamed: 0,City,Median_Age,Median_Income,Normalized_Income,Male_Percent,Hispanic_Percent
192,Unincorporated,39.0,23900.0,0.33,0.49,0.32
193,Palm Springs,39.0,23900.0,0.33,0.49,0.32
194,Unincorporated,39.0,23900.0,0.33,0.49,0.32
195,Banning,39.0,23900.0,0.33,0.49,0.32
196,Jurupa Valley,39.0,23900.0,0.33,0.49,0.32


In [18]:
# Row count of the stacked demographics table.
combined.shape[0]

197

In [19]:
# Row count of the rent/activity table, to compare with the stacked demographics table.
all_rent_activity_data.shape[0]

197

In [20]:
# Attach the rent/activity columns to the demographic rows.
#
# The join is keyed on the city name instead of on row position: matching row
# counts alone do not guarantee that both tables list the cities in the same
# order, and a positional join would then attach values to the wrong city
# without raising any error.
rent_activity_cols = list(all_rent_activity_data.loc[:, 'Avg Rent - Office':].columns)

# City names are not unique (e.g. 'Unincorporated' repeats), so each row is
# paired with its n-th occurrence of that name to obtain a one-to-one key.
combined_keyed = combined.assign(
    _city_occurrence=combined.groupby('City').cumcount()
)
rent_activity_keyed = all_rent_activity_data.assign(
    _city_occurrence=all_rent_activity_data.groupby('City').cumcount()
)[['City', '_city_occurrence'] + rent_activity_cols]

combined_all = combined_keyed.merge(
    rent_activity_keyed,
    on=['City', '_city_occurrence'],
    how='inner',
    validate='one_to_one',  # raise instead of silently duplicating rows
).drop(columns='_city_occurrence')

# An inner join drops unmatched rows, so equal lengths mean every city matched.
assert len(combined_all) == len(combined), 'some cities had no rent/activity match'

In [21]:
# Row count after attaching the rent/activity columns.
combined_all.shape[0]

197

In [22]:
# Preview the first rows only, rather than displaying the whole frame.
combined_all.head(10)

Unnamed: 0,City,Median_Age,Median_Income,Normalized_Income,Male_Percent,Hispanic_Percent,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Soccer_Activity
0,Aliso Viejo,46.0,40000.0,0.76,0.47,0.14,0.00,0.00,0.00,0.0
1,Anaheim,34.0,20000.0,0.22,0.51,0.55,20.55,11.91,21.82,2.0
2,Brea,39.0,31800.0,0.54,0.50,0.34,22.87,9.59,0.00,2.0
3,Buena Park,41.0,24000.0,0.33,0.49,0.28,0.00,0.00,0.00,0.0
4,Costa Mesa,36.0,30000.0,0.49,0.50,0.28,23.33,0.00,27.01,1.0
5,Cypress,41.0,24000.0,0.33,0.49,0.28,0.00,0.00,0.00,0.0
6,Dana Point,48.0,35000.0,0.62,0.50,0.19,25.23,0.00,29.10,1.0
7,Fountain Valley,36.0,30000.0,0.49,0.50,0.28,19.20,0.00,0.00,2.0
8,Fullerton,35.0,23000.0,0.30,0.51,0.45,19.09,0.00,0.00,0.0
9,Garden Grove,38.0,17000.0,0.14,0.48,0.32,0.00,0.00,0.00,0.0


In [23]:
# Write the joined table to disk without the index column.
combined_all.to_csv(path_or_buf='combined_all.csv', index=False)

**Note:** The columns in this combined df will be joined w/ the mapping shapefile in QGIS.

---