In [1]:
# GOALS

## Import your data into a Pandas DataFrame.
## Merge your DataFrames.
## Create a bubble chart that showcases the average fare versus the total number of rides with bubble size based on the total number of drivers for each city type, including urban, suburban, and rural.
## Determine the mean, median, and mode for the following:
## The total number of rides for each city type.
## The average fares for each city type.
## The total number of drivers for each city type.
## Create box-and-whisker plots that visualize each of the following to determine if there are any outliers:
## The number of rides for each city type.
## The fares for each city type.
## The number of drivers for each city type.
## Create a pie chart that visualizes each of the following data for each city type:
## The percent of total fares.
## The percent of total rides.
## The percent of total drivers.

In [2]:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import statistics
import numpy as np

In [3]:
# Input CSV locations (relative paths) for the city and ride datasets.
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)

In [4]:
# Load the city CSV into a pandas DataFrame, then preview its first ten rows.
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
city_data_df.head(n=10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [5]:
# Load the ride CSV into a pandas DataFrame, then preview its first ten rows.
ride_data_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)
ride_data_df.head(n=10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091
7,Simpsonburgh,2019-04-26 00:43:24,35.98,111953927754
8,South Karenland,2019-01-08 03:28:48,35.09,7995623208694
9,North Jasmine,2019-03-09 06:26:29,42.81,5327642267789


In [6]:
# cleaning city data

# NOTE: a notebook cell renders only its final expression, so the three
# inspection expressions below show nothing unless run on their own.

## Count the non-null entries in every column.
### reveals there are 120 rows
city_data_df.count()
## Count the null entries in every column.
### reveals there are 0 null rows
city_data_df.isnull().sum()

## Make sure the driver_count column has an integer data type.
city_data_df.dtypes

## Find out how many data points there are for each type of city.

# Tally the rows per city type. sort=False keeps the types in order of first
# appearance, matching the order a manual first-seen loop would produce.
city_type_dict = city_data_df.groupby('type', sort=False).size().to_dict()

# Distinct city types in first-appearance order (the dict's key order).
city_type_list = list(city_type_dict)

# display results
city_type_dict

{'Urban': 66, 'Suburban': 36, 'Rural': 18}

In [7]:
# cleaning ride data

## Non-null entry count for every column.
### reveals there are 2375 rows
ride_data_df.count()
## Null entry count for every column.
### reveals there are 0 null rows
ride_data_df.isna().sum()

## Make sure the fare and ride_id columns are numerical data types.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [8]:
# Attach each ride's city attributes: inner join of rides and cities on 'city'.
pyber_data_df = ride_data_df.merge(city_data_df, how='inner', on='city')
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,Lake Jonathanshire,2019-04-07 20:51:11,31.25,4441251834598,5,Urban
2,Lake Jonathanshire,2019-03-09 23:45:55,19.89,2389495660448,5,Urban
3,Lake Jonathanshire,2019-04-07 18:09:21,24.28,7796805191168,5,Urban
4,Lake Jonathanshire,2019-01-02 14:14:50,13.89,424254840012,5,Urban


In [9]:
# Per-column null counts for the merged frame, to confirm the merge added none.
pyber_data_df.isna().sum()

city            0
date            0
fare            0
ride_id         0
driver_count    0
type            0
dtype: int64

In [10]:
# For the bubble chart, we will need to plot the following (one bubble per
# city, with the cities grouped and colored by city type):


# The average fare for each city on the y-axis

# The total number of rides for each city on the x-axis

# Make the size of each marker, or bubble, correlate to the
# number of drivers in that city

In [52]:
# Split the merged data into one DataFrame per city type (Urban, Suburban, Rural),
# then preview the rural subset.
urban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Urban")]
suburban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Suburban")]
rural_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Rural")]
rural_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2251,Randallchester,2019-02-19 03:52:47,58.55,8004803682564,9,Rural
2252,Randallchester,2019-02-11 05:42:29,25.78,9010611749008,9,Rural
2253,Randallchester,2019-03-25 13:36:46,10.37,3216382725494,9,Rural
2254,Randallchester,2019-04-07 23:42:07,10.79,1615474447641,9,Rural


In [64]:
def _per_city_metrics(cities_df):
    """Compute the per-city bubble-chart metrics for one city-type DataFrame.

    The needed column is selected *before* aggregating. Calling
    ``groupby(...).mean()`` on the whole frame would also try to average the
    string columns ('date', 'type'), which raises ``TypeError`` on pandas >= 2.0
    and wastes work on older versions.

    Parameters
    ----------
    cities_df : pandas.DataFrame
        Rides with at least the 'city', 'fare', 'ride_id' and 'driver_count'
        columns.

    Returns
    -------
    tuple of pandas.Series
        ``(avg_fare, ride_count, driver_count)``, each indexed by city:
        mean of 'fare', count of non-null 'ride_id', and mean of 'driver_count'.
    """
    by_city = cities_df.groupby('city')
    return (
        by_city['fare'].mean(),
        by_city['ride_id'].count(),
        by_city['driver_count'].mean(),
    )


# average fare + ride count + average drivers for each urban city
urban_avg_fare, urban_ride_count, urban_driver_count = _per_city_metrics(urban_cities_df)

# average fare + ride count + average drivers for each suburban city
suburban_avg_fare, suburban_ride_count, suburban_driver_count = _per_city_metrics(suburban_cities_df)

# average fare + ride count + average drivers for each rural city
rural_avg_fare, rural_ride_count, rural_driver_count = _per_city_metrics(rural_cities_df)

# verify 2375 ride count
# (not rendered: only the cell's final expression is displayed)
urban_ride_count.sum()+suburban_ride_count.sum()+rural_ride_count.sum()

# verify means
rural_driver_count

city
Bradshawfurt         7.0
Garzaport            7.0
Harringtonfort       4.0
Jessicaport          1.0
Lake Jamie           4.0
Lake Latoyabury      2.0
Michaelberg          6.0
New Ryantown         2.0
Newtonview           1.0
North Holly          8.0
North Jaime          1.0
Penaborough          6.0
Randallchester       9.0
South Jennifer       7.0
South Marychester    1.0
South Saramouth      7.0
Taylorhaven          1.0
West Heather         4.0
Name: driver_count, dtype: float64

In [114]:
# MY CODE TO MAKE SCATTERPLOT

# # highest_ride_counts=[max(urban_ride_count),max(suburban_ride_count),max(rural_ride_count)]
# # highest_avg_fares=[max(urban_avg_fare),max(suburban_avg_fare),max(rural_avg_fare)]

# # lowest_ride_counts=[min(urban_ride_count),min(suburban_ride_count),min(rural_ride_count)]
# # lowest_avg_fares=[min(urban_avg_fare),min(suburban_avg_fare),min(rural_avg_fare)]

# # max_ride_count = max(highest_ride_counts)
# # max_avg_fare = round(max(highest_avg_fares))
# # min_ride_count = min(lowest_ride_counts)
# # min_avg_fare = round(min(lowest_avg_fares))
# # min_avg_fare




# # y_tick_intervals=np.arange(min_avg_fare-5,max_avg_fare+5,5)





# # plt.subplots(figsize=(8,8))

# # plt.scatter(urban_ride_count,urban_avg_fare,label='Urban', color='orangered', edgecolors='k', s=[5*i for i in urban_driver_count])
# # plt.scatter(suburban_ride_count,suburban_avg_fare,label='Suburban',color='skyblue',edgecolors='k',s=[5*i for i in suburban_driver_count])
# # plt.scatter(rural_ride_count,rural_avg_fare,label='Rural',color='gold',edgecolors='k',s=[5*i for i in rural_driver_count])

# # plt.xlim(min_ride_count-5,max_ride_count+5)
# # plt.ylim(min_avg_fare-5,max_avg_fare+5)

# # plt.yticks(y_tick_intervals)

# # plt.xlabel('Total Number of Rides (Per City)')
# # plt.ylabel('Average Fare ($)')


# # plt.grid()
# # plt.legend()


# # plt.show()