In [1]:
#Add Matplotlib inline magic command
%matplotlib inline
#Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Relative paths of the two input CSV files (city attributes and individual rides).
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)

In [3]:
# Load the city CSV into a pandas DataFrame.
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)

In [4]:
# Load the ride CSV into a pandas DataFrame.
ride_data_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)

In [5]:
# List the distinct city types present in the city data.
pd.unique(city_data_df["type"])

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [6]:
# Count the rows of the city table whose type is "Urban".
int((city_data_df["type"] == "Urban").sum())

66

In [7]:
# Count the rows of the city table whose type is "Suburban".
int((city_data_df["type"] == "Suburban").sum())

36

In [8]:
# Count the rows of the city table whose type is "Rural".
int((city_data_df["type"] == "Rural").sum())

18

In [9]:
# Inspect the dtype pandas inferred for each column of the ride data.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id    float64
dtype: object

# Merge the ride and city data to create the PyBer DataFrame

In [10]:
# Left-join the city attributes (driver_count, type) onto every ride, matching rows
# on the shared "city" column, so each ride row carries its city's metadata.
# The key is named once; the original listed "city" twice in `on`.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")
# Display the merged DataFrame.
pyber_data_df

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,1/14/2019 10:14,13.83,5.739410e+12,5,Urban
1,South Michelleport,3/4/2019 18:24,30.24,2.343910e+12,72,Urban
2,Port Samanthamouth,2/24/2019 4:29,33.44,2.005070e+12,57,Urban
3,Rodneyfort,2/10/2019 23:22,23.44,5.149250e+12,34,Urban
4,South Jack,3/6/2019 4:28,34.58,3.908450e+12,46,Urban
...,...,...,...,...,...,...
2370,Michaelberg,4/29/2019 17:04,13.38,8.550370e+12,6,Rural
2371,Lake Latoyabury,1/30/2019 0:05,20.76,9.018730e+12,2,Rural
2372,North Jaime,2/10/2019 21:03,11.11,2.781340e+12,1,Rural
2373,West Heather,5/7/2019 19:22,44.94,4.256850e+12,4,Rural


# Analysis: split the merged data into one DataFrame per city type

In [11]:
# Split the merged rides into one DataFrame per city type.
urban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Urban")]
suburban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Suburban")]
rural_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Rural")]


In [12]:
# Total fare collected in each urban city.
# Select the "fare" column before aggregating so that only that column is summed;
# summing the whole frame first would also try to add up the string columns
# (date, type), which is wasted work and version-dependent behaviour.
urban_total_fare = urban_cities_df.groupby("city")["fare"].sum()
urban_total_fare

city
Amandaburgh            443.55
Barajasview            557.31
Carriemouth            764.49
Christopherfurt        661.55
Deanville              491.01
                        ...  
West Patrickchester    451.73
West Robert            778.84
West Samuelburgh       544.19
Williamsstad           560.33
Williamsview           531.98
Name: fare, Length: 66, dtype: float64

In [13]:
# Total fare collected in each suburban and each rural city.
# The "fare" column is selected before aggregating so the string columns
# (date, type) are never passed through sum().
suburban_total_fare = suburban_cities_df.groupby("city")["fare"].sum()
rural_total_fare = rural_cities_df.groupby("city")["fare"].sum()

# This is the statistics section

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the urban rides.
urban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,1625.0,1625.0,1625.0
mean,24.525772,4873485000000.0,36.678154
std,11.738649,2907440000000.0,20.075545
min,4.05,14588100000.0,3.0
25%,14.55,2400240000000.0,22.0
50%,24.64,4711190000000.0,37.0
75%,34.58,7451580000000.0,52.0
max,44.97,9991540000000.0,73.0


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the suburban rides.
suburban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,625.0,625.0,625.0
mean,30.970128,4971015000000.0,13.712
std,10.677508,2912410000000.0,8.042372
min,12.05,321833700.0,1.0
25%,21.97,2364250000000.0,5.0
50%,30.75,5053220000000.0,16.0
75%,39.83,7551670000000.0,21.0
max,49.96,9917740000000.0,25.0


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the rural rides.
rural_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,125.0,125.0,125.0
mean,34.62344,4647746000000.0,4.296
std,14.558046,2887834000000.0,2.691156
min,10.11,26848730000.0,1.0
25%,19.9,2275360000000.0,1.0
50%,37.05,4023960000000.0,4.0
75%,47.07,7118050000000.0,7.0
max,58.55,9990580000000.0,9.0


In [None]:
# The *_ride_count series are used from here on but were never defined in the
# notebook, so a fresh kernel would fail with NameError. Define them as the
# number of rides recorded for each city within a city type.
# NOTE(review): definition inferred from the variable names - confirm it matches
# the intended "ride count".
urban_ride_count = urban_cities_df.groupby("city")["ride_id"].count()
suburban_ride_count = suburban_cities_df.groupby("city")["ride_id"].count()
rural_ride_count = rural_cities_df.groupby("city")["ride_id"].count()

# Mean ride count for each city type (urban, suburban, rural), rounded to two decimals.
round(urban_ride_count.mean(), 2), round(suburban_ride_count.mean(), 2), round(rural_ride_count.mean(), 2)

In [None]:
# Median ride count for each city type (urban, suburban, rural), rounded to two decimals.
tuple(
    round(ride_count.median(), 2)
    for ride_count in (urban_ride_count, suburban_ride_count, rural_ride_count)
)

In [None]:
# Mode(s) of the ride count for each city type (urban, suburban, rural), rounded.
tuple(
    round(ride_count.mode())
    for ride_count in (urban_ride_count, suburban_ride_count, rural_ride_count)
)

In [None]:
# Show the unrounded mode(s) of the suburban ride count on their own.
suburban_ride_count.mode()

In [None]:
# Show the unrounded mode(s) of the rural ride count on their own.
rural_ride_count.mode()

In [None]:
import numpy as np
import scipy.stats as sts

In [None]:
# Central tendency of the ride count for the urban cities: mean and median via
# NumPy, mode via SciPy's stats module (imported as `sts`).
mean_urban_ride_count = np.mean(urban_ride_count)
median_urban_ride_count = np.median(urban_ride_count)
mode_urban_ride_count = sts.mode(urban_ride_count)

# Report each measure with an f-string; only the mean is formatted to two decimals.
print(f"The mean for the ride counts for the urban trips is {mean_urban_ride_count:.2f}.")
print(f"The median for the ride counts for the urban trips is {median_urban_ride_count}.")
print(f"The mode for the ride counts for the urban trips is {mode_urban_ride_count}.")

In [None]:
# Central tendency of the ride count for the suburban cities: mean and median via
# NumPy, mode via SciPy's stats module (imported as `sts`).
mean_suburban_ride_count = np.mean(suburban_ride_count)
median_suburban_ride_count = np.median(suburban_ride_count)
mode_suburban_ride_count = sts.mode(suburban_ride_count)

# Report each measure with an f-string; only the mean is formatted to two decimals.
print(f"The mean for the ride counts for the suburban trips is {mean_suburban_ride_count:.2f}.")
print(f"The median for the ride counts for the suburban trips is {median_suburban_ride_count}.")
print(f"The mode for the ride counts for the suburban trips is {mode_suburban_ride_count}.")

In [None]:
# Central tendency of the ride count for the rural cities: mean and median via
# NumPy, mode via SciPy's stats module (imported as `sts`).
mean_rural_ride_count = np.mean(rural_ride_count)
median_rural_ride_count = np.median(rural_ride_count)
mode_rural_ride_count = sts.mode(rural_ride_count)

# Report each measure with an f-string; only the mean is formatted to two decimals.
print(f"The mean for the ride counts for the rural trips is {mean_rural_ride_count:.2f}.")
print(f"The median for the ride counts for the rural trips is {median_rural_ride_count}.")
print(f"The mode for the ride counts for the rural trips is {mode_rural_ride_count}.")

# Summary Statistics for Fare by City Type

In [None]:
# Fare of every urban ride, as a Series.
urban_fares = urban_cities_df.loc[:, "fare"]

In [None]:
# Fare of every suburban ride, as a Series.
suburban_fares = suburban_cities_df.loc[:, "fare"]

In [None]:
# Fare of every rural ride, as a Series.
rural_fares = rural_cities_df.loc[:, "fare"]

In [None]:
# Mean fare for each city type (urban, suburban, rural), rounded to two decimals.
# The middle entry now uses suburban_fares; it previously repeated urban_fares,
# so the suburban mean was never shown.
round(urban_fares.mean(), 2), round(suburban_fares.mean(), 2), round(rural_fares.mean(), 2)

In [None]:
# Central tendency of the fare for urban rides: mean and median via NumPy,
# mode via SciPy's stats module (imported as `sts`).
mean_urban_fares = np.mean(urban_fares)
median_urban_fares = np.median(urban_fares)
mode_urban_fares = sts.mode(urban_fares)

# Mean and median are printed as dollar amounts; the mode is printed as returned.
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

In [None]:
# Driver count carried on every urban ride row, followed by the number of rows.
urban_drivers = urban_cities_df.loc[:, "driver_count"]
urban_drivers.size

In [None]:
# Driver count carried on every suburban ride row, followed by the number of rows.
suburban_drivers = suburban_cities_df.loc[:, "driver_count"]
suburban_drivers.size

In [None]:
# Driver count carried on every rural ride row, followed by the number of rows.
rural_drivers = rural_cities_df.loc[:, "driver_count"]
rural_drivers.size

In [None]:
# Central tendency of the driver count over the urban ride rows: mean and median
# via NumPy, mode via SciPy's stats module (imported as `sts`).
mean_urban_drivers = np.mean(urban_drivers)
median_urban_drivers = np.median(urban_drivers)
mode_urban_drivers = sts.mode(urban_drivers)

# Mean and median are formatted to two decimals; the mode is printed as returned.
print(f"The mean number of drivers for urban cities is {mean_urban_drivers:.2f}.")
print(f"The median number of drivers for urban cities is {median_urban_drivers:.2f}.")
print(f"The mode for drivers for urban cities is {mode_urban_drivers}.")

In [None]:
# Central tendency of the driver count over the suburban ride rows: mean and median
# via NumPy, mode via SciPy's stats module (imported as `sts`).
mean_suburban_drivers = np.mean(suburban_drivers)
median_suburban_drivers = np.median(suburban_drivers)
mode_suburban_drivers = sts.mode(suburban_drivers)

# Mean and median are formatted to two decimals; the mode is printed as returned.
print(f"The mean number of drivers for suburban cities is {mean_suburban_drivers:.2f}.")
print(f"The median number of drivers for suburban cities is {median_suburban_drivers:.2f}.")
print(f"The mode for drivers for suburban cities is {mode_suburban_drivers}.")

## Here are the calculations for the summary data frame

In [None]:
# Total fares for each city type.
# The "fare" column is selected before aggregating so that sum() is applied to
# that column only, not to the string columns (city, date) of the merged frame.
sum_fares_by_type = pyber_data_df.groupby("type")["fare"].sum()
sum_fares_by_type

In [None]:
# Grand total of the fares over all rides.
total_fares = pyber_data_df.loc[:, "fare"].sum()
total_fares

In [None]:
#Calculate the percentage of fare for each city type.
#type_percents = 100*sum_fares_by_type/ total_fares
#type_percents

In [None]:
# More advanced coding method combining the two variables into one line of code
#type_percents = 100*pyber_data_df.groupby(["type"]).sum()["fare"] /pyber_data_df["fare"].sum()
#type_percents

## Keep this section

In [None]:
# Number of rides (non-null ride_id values) for each city type.
total_rides_per_city_type = pyber_data_df.groupby("type")["ride_id"].count()
total_rides_per_city_type

In [None]:
# Number of rides (non-null ride_id values) across all city types.
pyber_data_df.loc[:, "ride_id"].count()

In [None]:
#grouping by total drivers
#pyber_data_df.groupby(["type"]).count()["driver_count"]

In [None]:
# Rebuild the per-city-type DataFrames from the merged data.
urban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Urban")]
suburban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Suburban")]
rural_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Rural")]

In [None]:
# Number of rides (non-null ride_id values) within each city type.
urban_total_rides = urban_cities_df.loc[:, "ride_id"].count()
suburban_total_rides = suburban_cities_df.loc[:, "ride_id"].count()
rural_total_rides = rural_cities_df.loc[:, "ride_id"].count()

In [None]:
#create a per city type series
#per_city_types = city_data_df.set_index(["city"])["type"]
#per_city_types.tail(10)

## Total drivers per city type (used for the "Total Drivers" column of the summary DataFrame)

In [None]:
#total_rural_drivers = rural_cities_df.groupby(["city"])["driver_count"].max()

In [None]:
# driver_count comes from the city table and is repeated on every ride row after
# the left merge, so take one value per city (max) before adding the cities up.
total_urban_drivers = sum(urban_cities_df.groupby(["city"])["driver_count"].max())

In [None]:
total_urban_drivers

In [None]:
# driver_count comes from the city table and is repeated on every ride row after
# the left merge, so take one value per city (max) before adding the cities up.
total_suburban_drivers = sum(suburban_cities_df.groupby(["city"])["driver_count"].max())

In [None]:
total_suburban_drivers

In [None]:
# driver_count comes from the city table and is repeated on every ride row after
# the left merge, so take one value per city (max) before adding the cities up.
total_rural_drivers = sum(rural_cities_df.groupby(["city"])["driver_count"].max())

In [None]:
total_rural_drivers

In [None]:
# Grand total of the fares over all rides (recomputed for the summary section).
total_fares = pyber_data_df.loc[:, "fare"].sum()
total_fares

In [None]:
# Sum of the fares of all urban rides.
total_urban_fares = urban_cities_df.loc[:, "fare"].sum()
total_urban_fares

In [None]:
# Sum of the fares of all rural rides.
total_rural_fares = rural_cities_df.loc[:, "fare"].sum()
total_rural_fares

In [None]:
# Sum of the fares of all suburban rides.
total_suburban_fares = suburban_cities_df.loc[:, "fare"].sum()
total_suburban_fares

In [None]:
# Average fare per suburban ride.
mean_suburban_fares = suburban_cities_df.loc[:, "fare"].mean()
mean_suburban_fares

In [None]:
# Average fare per urban ride.
mean_urban_fares = urban_cities_df.loc[:, "fare"].mean()
mean_urban_fares

In [None]:
# Average fare per rural ride.
mean_rural_fares = rural_cities_df.loc[:, "fare"].mean()
mean_rural_fares

In [None]:
# Average fare per driver for rural cities: total rural fares divided by the
# total number of rural drivers.
avg_fare_driver_rural = total_rural_fares /total_rural_drivers
avg_fare_driver_rural

In [None]:
# Average fare per driver for suburban cities: total suburban fares divided by
# the total number of suburban drivers.
avg_fare_driver_suburban = total_suburban_fares /total_suburban_drivers
avg_fare_driver_suburban

In [None]:
# Average fare per driver for urban cities: total urban fares divided by the
# total number of urban drivers.
avg_fare_driver_urban = total_urban_fares /total_urban_drivers
avg_fare_driver_urban

In [None]:
#total_drivers_sum = sum(pyber_data_df.groupby(["type","city"])["driver_count"].max())

In [None]:
# City-type labels in the order used for the summary table.
# NOTE(review): this list is not referenced by any later cell shown here.
pyber_table_index = ["Rural", "Suburban","Urban"]

In [None]:
# One-element label list; its length makes the rural summary frame a single row.
pyber_table_index_r =["Rural"]

In [None]:
# One-element label list; its length makes the urban summary frame a single row.
pyber_table_index_u =["Urban"]

In [None]:
# One-element label list; its length makes the suburban summary frame a single row.
pyber_table_index_s =["Suburban"]

In [None]:
# One-row summary for the rural city type. The scalar values are broadcast
# against the single-element "City Type" list.
rural_df = pd.DataFrame(
    data={
        "City Type": pyber_table_index_r,
        "Total Rides": rural_total_rides,
        "Total Drivers": total_rural_drivers,
        "Total Fares": total_rural_fares,
        "Average Fare per Ride": mean_rural_fares,
        "Average Fare per Driver": avg_fare_driver_rural,
    }
)

In [None]:
rural_df

In [None]:
# One-row summary for the urban city type. The scalar values are broadcast
# against the single-element "City Type" list.
urban_df = pd.DataFrame(
    data={
        "City Type": pyber_table_index_u,
        "Total Rides": urban_total_rides,
        "Total Drivers": total_urban_drivers,
        "Total Fares": total_urban_fares,
        "Average Fare per Ride": mean_urban_fares,
        "Average Fare per Driver": avg_fare_driver_urban,
    }
)

In [None]:
urban_df

In [None]:
# One-row summary for the suburban city type. The scalar values are broadcast
# against the single-element "City Type" list.
suburban_df = pd.DataFrame(
    data={
        "City Type": pyber_table_index_s,
        "Total Rides": suburban_total_rides,
        "Total Drivers": total_suburban_drivers,
        "Total Fares": total_suburban_fares,
        "Average Fare per Ride": mean_suburban_fares,
        "Average Fare per Driver": avg_fare_driver_suburban,
    }
)

In [None]:
suburban_df

In [None]:
# Stack the rural and suburban summary rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
append1_df = pd.concat([rural_df, suburban_df], ignore_index=True)

In [None]:
append1_df

In [None]:
# Add the urban row to complete the summary, again with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pyBer_Summary_df = pd.concat([append1_df, urban_df], ignore_index=True)

In [None]:
# Use the city type as the row label. The result is reassigned rather than
# mutated with inplace=True, so the cell reads as a plain transformation.
pyBer_Summary_df = pyBer_Summary_df.set_index("City Type")
pyBer_Summary_df

In [None]:
# Check the column dtypes and non-null counts before formatting the values.
pyBer_Summary_df.info()

In [None]:
# Display format for each column of the PyBer summary: thousands separators for
# the counts, dollar amounts with two decimals for the money columns.
summary_column_formats = {
    "Total Rides": "{:,}",
    "Total Drivers": "{:,}",
    "Total Fares": "${:,.2f}",
    "Average Fare per Ride": "${:,.2f}",
    "Average Fare per Driver": "${:,.2f}",
}

# Each column is overwritten with its formatted strings, so this cell can only
# be run once per freshly built summary frame: the numeric format specs do not
# accept values that are already strings.
for summary_column, summary_format in summary_column_formats.items():
    pyBer_Summary_df[summary_column] = pyBer_Summary_df[summary_column].map(summary_format.format)

In [None]:
pyBer_Summary_df

# This is Part 2 of the Challenge

In [None]:
#Add Matplotlib inline magic command
%matplotlib inline
#Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Relative paths of the two input CSV files (city attributes and individual rides).
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)