### Cleaning Southern California Soccer Activity & Rent Data

---

Summary of Data Collection (before cleaning):
- Created spreadsheets of active soccer leagues (city-run and private) and building rent data in LA & Orange counties

---

In [1]:
import array
import os

import geopandas as gpd
import googlemaps
import numpy as np
import pandas as pd
from shapely.geometry import Point

---

#### Cleaning Spreadsheet Data:

In [2]:
# Load the OC league / rent spreadsheet (path is relative to the working directory).
oc = pd.read_csv('OC_Soccer_Fields1.csv')

In [3]:
# Load the LA league / rent spreadsheet (path is relative to the working directory).
la = pd.read_csv('LA_Soccer_Fields1.csv')

In [4]:
# Preview the first row to check the column layout.
oc.head(1)

Unnamed: 0,City,City_Address,Meetup,Private League1,Private League2,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail
0,Aliso Viejo,No,,Elite Soccer League; 26895 Aliso Creek Rd b249...,,,,


In [5]:
# Preview the first row; the columns are expected to line up with the OC sheet before concatenating.
la.head(1)

Unnamed: 0,City,City_Address,Meetup,Private League1,Private League2,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail
0,Agoura Hills,No,,No,,$23.23,,


In [6]:
# Stack the two county sheets into one frame with a fresh 0..n-1 row index.
county_frames = [oc, la]
leagues_df = pd.concat(county_frames, sort=False, ignore_index=True)

In [7]:
# Creating separate columns for name & address.
# A 'Private LeagueN' cell is either "<name>; <address>" or has no separator at all
# (e.g. 'No' or an empty cell).
for source_col, name_col in (('Private League1', 'Private1'), ('Private League2', 'Private2')):
    # n=1 splits on the first '; ' only, so a separator appearing later in the
    # address cannot produce a third column; reindex guarantees both result
    # columns exist even when no cell in the column contains the separator.
    parts = leagues_df[source_col].str.split('; ', n=1, expand=True).reindex(columns=[0, 1])
    leagues_df[name_col] = parts[0]
    leagues_df[name_col + '_Address'] = parts[1]

In [8]:
# The combined source columns are redundant once they have been split into
# name / address; every remaining empty cell (in any column) is then filled
# with the 'No' placeholder.
combined_cols = ['Private League1', 'Private League2']
leagues_df = leagues_df.drop(columns=combined_cols)
leagues_df = leagues_df.fillna('No')

In [9]:
# Changing the names of these cities due to different naming in data used in later notebooks.
# Each pair is (substring to look for in 'City', name to store instead); applied in order.
city_renames = [
    ('Industry', 'Industry'),
    ('Palos Verdes', 'Palos Verdes'),
    ('La Cañada Flintridge', 'La Canada Flintridge'),
]

for fragment, canonical_name in city_renames:
    matches = leagues_df['City'].str.contains(fragment)
    leagues_df.loc[matches, 'City'] = canonical_name

In [10]:
# Check the split name / address columns and the 'No' placeholders.
leagues_df.head(2)

Unnamed: 0,City,City_Address,Meetup,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Private1,Private1_Address,Private2,Private2_Address
0,Aliso Viejo,No,No,No,No,No,Elite Soccer League,"26895 Aliso Creek Rd b249, Aliso Viejo, CA 92656",No,No
1,Anaheim,"2271 W.Crescent Avenue, Anaheim, CA 92801",No,$20.55,$11.91,$21.82,PLA Sports,627 S. HARBOR BLVD ANAHEIM CA 92805,No,No


---

Converting Rent Columns to Floats:

In [11]:
# Rent values arrive as strings such as '$23.23'; cells with no rent figure hold
# the 'No' placeholder (from the earlier fillna) and are converted to 0.0.
rent_cols = ['Avg Rent - Office', 'Avg Rent - Industrial', 'Avg Rent - Retail']

for col in rent_cols:
    leagues_df[col] = (
        leagues_df[col]
        .astype(str)                            # lets this cell be re-run after the columns are already floats
        .str.strip()
        .replace('No', '0')                     # missing rent -> 0
        .str.replace(r'[$,]', '', regex=True)   # remove the currency symbol / thousands separators explicitly
        .astype('float')
    )

In [12]:
# Confirm the rent columns are now numeric (0.0 where no rent was listed).
leagues_df.head(2)

Unnamed: 0,City,City_Address,Meetup,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Private1,Private1_Address,Private2,Private2_Address
0,Aliso Viejo,No,No,0.0,0.0,0.0,Elite Soccer League,"26895 Aliso Creek Rd b249, Aliso Viejo, CA 92656",No,No
1,Anaheim,"2271 W.Crescent Avenue, Anaheim, CA 92801",No,20.55,11.91,21.82,PLA Sports,627 S. HARBOR BLVD ANAHEIM CA 92805,No,No


---

Adding Activity Column (2 points when a `City_Address` is listed, plus 1 point when a `Meetup` is listed):

In [13]:
# Score each city: 2 points when City_Address is populated, plus 1 when Meetup
# is populated ('No' is the placeholder for an empty cell). Computed column-wise
# rather than row by row.
has_city_address = leagues_df['City_Address'] != 'No'
has_meetup = leagues_df['Meetup'] != 'No'

# Booleans are cast to 0/1 before weighting; the result is a plain list of ints,
# one entry per row of leagues_df, in row order.
activity_levels = (2 * has_city_address.astype(int) + has_meetup.astype(int)).tolist()

In [14]:
# Attach the per-row scores computed in the previous cell as a new column.
leagues_df['Soccer_Activity'] = activity_levels

In [15]:
# Check that the new Soccer_Activity column is present.
leagues_df.head(1)

Unnamed: 0,City,City_Address,Meetup,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Private1,Private1_Address,Private2,Private2_Address,Soccer_Activity
0,Aliso Viejo,No,No,0.0,0.0,0.0,Elite Soccer League,"26895 Aliso Creek Rd b249, Aliso Viejo, CA 92656",No,No,0


---

#### Adding Lat/Lon Coordinates of Private Leagues:

In [16]:
# Read the Google Maps key from the GOOGLE_MAPS_API_KEY environment variable so
# the real key never has to be saved in the notebook. '(HIDDEN)' is kept only as
# a placeholder default and is not a usable key.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '(HIDDEN)')

In [17]:
# Client object used for the geocoding requests later in the notebook.
googmap = googlemaps.Client(key=API_KEY)

In [18]:
# Keep only the rows that list a first / second private league
# ('No' marks an empty address cell).
has_private1 = leagues_df['Private1_Address'].ne('No')
has_private2 = leagues_df['Private2_Address'].ne('No')

private1 = leagues_df.loc[has_private1, ['City', 'Private1', 'Private1_Address']]
private2 = leagues_df.loc[has_private2, ['City', 'Private2', 'Private2_Address']]

In [19]:
# Order matters: the geocoding loop pairs these frames positionally with the
# 'Private1' / 'Private2' column prefixes.
dfs = [private1, private2]

In [21]:
# Geocode every private league address and collect city, league name and
# coordinates into four parallel lists (one entry per league).
types = ['Private1', 'Private2']
cities = []
names = []
lats = []
lons = []

# Pair each frame with its column prefix directly instead of tracking a manual counter.
for league_col, df in zip(types, dfs):
    address_col = league_col + '_Address'
    for i, row in df.iterrows():
        cities.append(row.City)
        names.append(row[league_col])

        results = googmap.geocode(row[address_col])
        if results:
            # Use the first match returned for the address.
            location = results[0]['geometry']['location']
            lats.append(float(location['lat']))
            lons.append(float(location['lng']))
        else:
            # No match for this address: keep the row so the four lists stay the
            # same length, but record the coordinates as missing instead of
            # failing on results[0].
            print('No geocoding result for', row[address_col])
            lats.append(float('nan'))
            lons.append(float('nan'))

In [22]:
# Assemble the geocoded leagues into one table, one row per league.
league_columns = dict(City=cities, Name=names, Lat=lats, Lon=lons)
private_leagues = pd.DataFrame(league_columns)

In [23]:
# Spot-check the geocoded coordinates.
private_leagues.head()

Unnamed: 0,City,Name,Lat,Lon
0,Aliso Viejo,Elite Soccer League,33.571661,-117.725517
1,Anaheim,PLA Sports,33.826502,-117.917325
2,Brea,AC Brea,33.919624,-117.850832
3,Costa Mesa,Soccer 6,33.675877,-117.877196
4,Garden Grove,Futsal Picante,33.775555,-118.010009


In [24]:
# Write the league locations next to the notebook; index=False keeps the row index out of the file.
private_leagues.to_csv('private_leagues.csv', index=False)

*This file will be used to plot private league locations as points on a map.*

---

In [25]:
# Subset to the city name, the three rent columns and the activity score.
export_cols = [
    'City',
    'Avg Rent - Office',
    'Avg Rent - Industrial',
    'Avg Rent - Retail',
    'Soccer_Activity',
]
rent_and_activity_data = leagues_df.loc[:, export_cols]

In [26]:
# Preview the export table; a rent of 0.0 means no rent figure was listed for that city.
rent_and_activity_data.head()

Unnamed: 0,City,Avg Rent - Office,Avg Rent - Industrial,Avg Rent - Retail,Soccer_Activity
0,Aliso Viejo,0.0,0.0,0.0,0
1,Anaheim,20.55,11.91,21.82,2
2,Brea,22.87,9.59,0.0,2
3,Buena Park,0.0,0.0,0.0,0
4,Costa Mesa,23.33,0.0,27.01,1


In [27]:
# Number of rows (one per row of leagues_df) in the export table.
len(rent_and_activity_data)

113

In [28]:
# Write the rent / activity table next to the notebook; index=False keeps the row index out of the file.
rent_and_activity_data.to_csv('rent_and_activity_data.csv', index=False)

*This file will be used in subsequent notebooks to specify which cities to extract census data for.*

---