In [1]:
# Import dependencies.
import pandas as pd
from citipy import citipy 

In [2]:
# Read the raw housing data from disk into a DataFrame.
file = 'Data/housing.csv'
df = pd.read_csv(filepath_or_buffer=file)

# Report the (rows, columns) shape, then preview the first five rows.
print(df.shape)
df.head(5)

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Pull the latitude and longitude columns out as arrays and pair them up
# row by row into a list of (latitude, longitude) tuples.
lat, lng = df['latitude'].values, df["longitude"].values
lat_lng = zip(lat, lng)
coordinates = [*lat_lng]

In [4]:
# Create 'City' Column.
# Every row starts with an empty-string placeholder; the final line displays
# the new column so the notebook output confirms it was added.
df['City'] = ""
df['City']

0         
1         
2         
3         
4         
        ..
20635     
20636     
20637     
20638     
20639     
Name: City, Length: 20640, dtype: object

In [5]:
# Identify the nearest city for each latitude and longitude combination and add to DF.
#
# The names are collected into a list and assigned to the column in a single
# statement. The previous per-row `df['City'][i] = city` was chained indexing:
# it raises SettingWithCopyWarning and can write to a temporary copy instead
# of `df` (under pandas Copy-on-Write it never updates `df` at all).
#
# `coordinates` holds (latitude, longitude) tuples in the same row order as
# `df`, so the positional assignment lines up with the rows.
df['City'] = [
    citipy.nearest_city(point_lat, point_lng).city_name
    for point_lat, point_lng in coordinates
]

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['City'][i] = city


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,City
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,berkeley
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,piedmont
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,piedmont
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,berkeley
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,berkeley


In [6]:
# Report how many distinct city names were assigned, then list them.
city_names = df['City']
print(city_names.nunique())
city_names.unique()

462


array(['berkeley', 'piedmont', 'oakland', 'orinda', 'alameda',
       'san leandro', 'moraga', 'albany', 'el cerrito', 'castro valley',
       'ashland', 'cherryland', 'hayward', 'san lorenzo', 'fairview',
       'union city', 'fremont', 'newark', 'milpitas', 'dublin',
       'pleasanton', 'livermore', 'south lake tahoe', 'placerville',
       'cameron park', 'lodi', 'chico', 'paradise', 'magalia', 'oroville',
       'grass valley', 'yuba city', 'oakdale', 'south yuba city',
       'woodland', 'clearlake', 'oakley', 'brentwood', 'antioch',
       'pittsburg', 'bay point', 'concord', 'martinez', 'pleasant hill',
       'clayton', 'walnut creek', 'alamo', 'san ramon', 'danville',
       'lafayette', 'hercules', 'benicia', 'vallejo', 'pinole',
       'el sobrante', 'san pablo', 'richmond', 'tiburon', 'arcata',
       'grants pass', 'auburn', 'folsom', 'fresno', 'clovis', 'selma',
       'kerman', 'sanger', 'reedley', 'dinuba', 'orange cove', 'parlier',
       'kingsburg', 'lemoore', 'hanf

In [7]:
# Show the number of missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
City                    0
dtype: int64

In [8]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [9]:
# Re-count missing values per column; every count should now be zero.
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
City                  0
dtype: int64

In [10]:
# Count rows that exactly repeat an earlier row (0 means no duplicates).
df.duplicated(keep='first').sum()

0

In [11]:
# Rename columns to human-readable labels.
# NOTE(review): 'Lattitude' is misspelled, but the column-reordering cell and
# the exported CSV use this exact label, so it is kept unchanged here.
column_labels = dict(
    longitude='Longitude',
    latitude='Lattitude',
    housing_median_age='Median Age',
    total_rooms='Total Rooms',
    total_bedrooms='Bedrooms',
    population='Population',
    households='Households',
    median_income='Median Income',
    median_house_value='Median House Value',
    ocean_proximity='Ocean Proximity',
)
df = df.rename(columns=column_labels)

df.head()

Unnamed: 0,Longitude,Lattitude,Median Age,Total Rooms,Bedrooms,Population,Households,Median Income,Median House Value,Ocean Proximity,City
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,berkeley
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,piedmont
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,piedmont
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,berkeley
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,berkeley


In [12]:
# Put the columns in presentation order: location first, then demographics,
# then the housing details.
column_order = [
    'City',
    'Longitude',
    'Lattitude',
    'Population',
    'Median Age',
    'Median Income',
    'Median House Value',
    'Total Rooms',
    'Bedrooms',
    'Households',
    'Ocean Proximity',
]
df = df[column_order]

df.head()

Unnamed: 0,City,Longitude,Lattitude,Population,Median Age,Median Income,Median House Value,Total Rooms,Bedrooms,Households,Ocean Proximity
0,berkeley,-122.23,37.88,322.0,41.0,8.3252,452600.0,880.0,129.0,126.0,NEAR BAY
1,piedmont,-122.22,37.86,2401.0,21.0,8.3014,358500.0,7099.0,1106.0,1138.0,NEAR BAY
2,piedmont,-122.24,37.85,496.0,52.0,7.2574,352100.0,1467.0,190.0,177.0,NEAR BAY
3,berkeley,-122.25,37.85,558.0,52.0,5.6431,341300.0,1274.0,235.0,219.0,NEAR BAY
4,berkeley,-122.25,37.85,565.0,52.0,3.8462,342200.0,1627.0,280.0,259.0,NEAR BAY


In [13]:
# Write the cleaned housing DataFrame to disk as a CSV file.
# NOTE(review): the index is written as the first (unnamed) column because
# `to_csv` defaults to index=True — confirm downstream readers expect it.
output_data_file = "Data/Clean_Housing.csv"
df.to_csv(path_or_buf=output_data_file)