# Part 1 of the Challenge

In [1]:
#Add Matplotlib inline magic command
%matplotlib inline
#Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#files to load
city_data_to_load = "Resources/city_data.csv"
ride_data_to_load = "Resources/ride_data.csv"

In [3]:
#Read the city data file and store it in a pandas DataFrame.
city_data_df = pd.read_csv(city_data_to_load)

In [4]:
#Read the ride data file and store it in a pandas DataFrame.
ride_data_df = pd.read_csv(ride_data_to_load)

In [5]:
#Get the unique values of the type of city.
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [6]:
#get the number of data points from the Urban cities.
sum(city_data_df["type"]== "Urban")

66

In [7]:
#get the number of data points from the subUrban cities.
sum(city_data_df["type"]== "Suburban")

36

In [8]:
#get the number of data points from the rural cities.
sum(city_data_df["type"]== "Rural")

18

In [9]:
#get the types of each column
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id    float64
dtype: object

# Here we begin the merge to create pyber Data Frame

In [10]:
#Combine the data into a single data set using left join.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on=["city","city"])
#display the DataFrame
pyber_data_df

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,1/14/2019 10:14,13.83,5.739410e+12,5,Urban
1,South Michelleport,3/4/2019 18:24,30.24,2.343910e+12,72,Urban
2,Port Samanthamouth,2/24/2019 4:29,33.44,2.005070e+12,57,Urban
3,Rodneyfort,2/10/2019 23:22,23.44,5.149250e+12,34,Urban
4,South Jack,3/6/2019 4:28,34.58,3.908450e+12,46,Urban
...,...,...,...,...,...,...
2370,Michaelberg,4/29/2019 17:04,13.38,8.550370e+12,6,Rural
2371,Lake Latoyabury,1/30/2019 0:05,20.76,9.018730e+12,2,Rural
2372,North Jaime,2/10/2019 21:03,11.11,2.781340e+12,1,Rural
2373,West Heather,5/7/2019 19:22,44.94,4.256850e+12,4,Rural


# Here we start the real analysis with new DataFrames

In [11]:
#create the urban, suburban and rural DataFrames.
urban_cities_df = pyber_data_df[pyber_data_df["type"] == "Urban"]
suburban_cities_df = pyber_data_df[pyber_data_df["type"] == "Suburban"]
rural_cities_df = pyber_data_df[pyber_data_df["type"] == "Rural"]


In [12]:
#Get the number of rides for urban cities
urban_ride_count = urban_cities_df.groupby(["city"]).count()["ride_id"]
urban_ride_count.head()

city
Amandaburgh        18
Barajasview        22
Carriemouth        27
Christopherfurt    27
Deanville          19
Name: ride_id, dtype: int64

In [13]:
#create the urban, suburban and rural ride count.
suburban_ride_count = suburban_cities_df.groupby(["city"]).count()["ride_id"]
rural_ride_count = rural_cities_df.groupby(["city"]).count()["ride_id"]
urban_ride_count = urban_cities_df.groupby(["city"]).count()["ride_id"]

In [14]:
urban_avg_fare = urban_cities_df.groupby(["city"]).mean()["fare"]


In [15]:
suburban_avg_fare = suburban_cities_df.groupby(["city"]).mean()["fare"]
rural_avg_fare = rural_cities_df.groupby(["city"]).mean()["fare"]

In [16]:
#get the average number of drivers for each urban city--this works too
urban_driver_count = urban_cities_df.groupby(["city"]).mean()["driver_count"]
sum(urban_driver_count)

2405

In [17]:
suburban_driver_count = suburban_cities_df.groupby(["city"]).mean()["driver_count"]
rural_driver_count = rural_cities_df.groupby(["city"]).mean()["driver_count"]

# This is the statistics section

In [18]:
#get summary statistics
urban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,1625.0,1625.0,1625.0
mean,24.525772,4873485000000.0,36.678154
std,11.738649,2907440000000.0,20.075545
min,4.05,14588100000.0,3.0
25%,14.55,2400240000000.0,22.0
50%,24.64,4711190000000.0,37.0
75%,34.58,7451580000000.0,52.0
max,44.97,9991540000000.0,73.0


In [19]:
#get summary statistics
suburban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,625.0,625.0,625.0
mean,30.970128,4971015000000.0,13.712
std,10.677508,2912410000000.0,8.042372
min,12.05,321833700.0,1.0
25%,21.97,2364250000000.0,5.0
50%,30.75,5053220000000.0,16.0
75%,39.83,7551670000000.0,21.0
max,49.96,9917740000000.0,25.0


In [20]:
#get summary statistics
rural_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,125.0,125.0,125.0
mean,34.62344,4647746000000.0,4.296
std,14.558046,2887834000000.0,2.691156
min,10.11,26848730000.0,1.0
25%,19.9,2275360000000.0,1.0
50%,37.05,4023960000000.0,4.0
75%,47.07,7118050000000.0,7.0
max,58.55,9990580000000.0,9.0


In [21]:
urban_ride_count.describe()

count    66.000000
mean     24.621212
std       5.408726
min      12.000000
25%      21.000000
50%      24.000000
75%      28.000000
max      39.000000
Name: ride_id, dtype: float64

In [22]:
rural_ride_count.describe()

count    18.000000
mean      6.944444
std       2.508157
min       3.000000
25%       5.250000
50%       6.000000
75%       8.750000
max      12.000000
Name: ride_id, dtype: float64

In [23]:
suburban_ride_count.describe()

count    36.000000
mean     17.361111
std       4.323707
min       9.000000
25%      14.000000
50%      17.000000
75%      19.250000
max      27.000000
Name: ride_id, dtype: float64

In [24]:
#calculate the mean of the ride count for each city type.
round(urban_ride_count.mean(),2), round(suburban_ride_count.mean(),2), round(rural_ride_count.mean(),2)

(24.62, 17.36, 6.94)

In [25]:
#calculate the median of the ride count for each city type.
round(urban_ride_count.median(),2), round(suburban_ride_count.median(),2), round(rural_ride_count.median(),2)

(24.0, 17.0, 6.0)

In [26]:
#calculate the mode of the ride count for each city type.
round(urban_ride_count.mode()), round(suburban_ride_count.mode()), round(rural_ride_count.mode())

(0    22
 1    25
 dtype: int64, 0    17
 dtype: int64, 0    6
 dtype: int64)

In [27]:
suburban_ride_count.mode()

0    17
dtype: int64

In [28]:
rural_ride_count.mode()

0    6
dtype: int64

In [29]:
import numpy as np
import scipy.stats as sts

In [30]:
#calculate the measures of central tendency for the ride count for the urban cities using the NumPy and statistics modules
#and print them out using the f-string function
mean_urban_ride_count = np.mean(urban_ride_count)
print(f"The mean for the ride counts for the urban trips is {mean_urban_ride_count:.2f}.")

median_urban_ride_count = np.median(urban_ride_count)
print(f"The median for the ride counts for the urban trips is {median_urban_ride_count}.")

mode_urban_ride_count = sts.mode(urban_ride_count)
print(f"The mode for the ride counts for the urban trips is {mode_urban_ride_count}.")

The mean for the ride counts for the urban trips is 24.62.
The median for the ride counts for the urban trips is 24.0.
The mode for the ride counts for the urban trips is ModeResult(mode=array([22], dtype=int64), count=array([7])).


In [31]:
#calculate the measures of central tendency for the ride count for the suburban cities
#and print them out using the f-string function
mean_suburban_ride_count = np.mean(suburban_ride_count)
print(f"The mean for the ride counts for the suburban trips is {mean_suburban_ride_count:.2f}.")

median_suburban_ride_count = np.median(suburban_ride_count)
print(f"The median for the ride counts for the suburban trips is {median_suburban_ride_count}.")

mode_suburban_ride_count = sts.mode(suburban_ride_count)
print(f"The mode for the ride counts for the suburban trips is {mode_suburban_ride_count}.")

The mean for the ride counts for the suburban trips is 17.36.
The median for the ride counts for the suburban trips is 17.0.
The mode for the ride counts for the suburban trips is ModeResult(mode=array([17], dtype=int64), count=array([7])).


In [32]:
#calculate the measures of central tendency for the ride count for the rural cities
#and print them out using the f-string function
mean_rural_ride_count = np.mean(rural_ride_count)
print(f"The mean for the ride counts for the rural trips is {mean_rural_ride_count:.2f}.")

median_rural_ride_count = np.median(rural_ride_count)
print(f"The median for the ride counts for the rural trips is {median_rural_ride_count}.")

mode_rural_ride_count = sts.mode(rural_ride_count)
print(f"The mode for the ride counts for the rural trips is {mode_rural_ride_count}.")

The mean for the ride counts for the rural trips is 6.94.
The median for the ride counts for the rural trips is 6.0.
The mode for the ride counts for the rural trips is ModeResult(mode=array([6], dtype=int64), count=array([5])).


# Summary Statistics for Fare by City Type

In [33]:
# Get the fares for the urban cities.
urban_fares = urban_cities_df["fare"]

In [34]:
#Get the fares for suburban cities
suburban_fares = suburban_cities_df["fare"]

In [35]:
#Get the fares for rural cities--this is a series we created
rural_fares = rural_cities_df["fare"]

In [36]:
#calculate the mean of the fare for each city type.
round(urban_fares.mean(),2), round(urban_fares.mean(),2), round(rural_fares.mean(),2)

(24.53, 24.53, 34.62)

In [37]:
# Calculate the measures of central tendency for the average fare for the urban cities.
mean_urban_fares = np.mean(urban_fares)
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")

median_urban_fares = np.median(urban_fares)
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")

mode_urban_fares = sts.mode(urban_fares)
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

The mean fare price for urban trips is $24.53.
The median fare price for urban trips is $24.64.
The mode fare price for urban trips is ModeResult(mode=array([22.86]), count=array([5])).


In [38]:
# Get the driver count data for the urban cities
urban_drivers = urban_cities_df["driver_count"]
len(urban_drivers)

1625

In [39]:
# Get the driver count data for the urban cities
suburban_drivers = suburban_cities_df["driver_count"]
len(suburban_drivers)

625

In [40]:
# Get the driver count data for the rural cities
rural_drivers = rural_cities_df["driver_count"]
len(rural_drivers)

125

In [41]:
# Calculate the measures of central tendency for the number of drivers for the urban cities.
mean_urban_drivers = np.mean(urban_drivers)
print(f"The mean number of drivers for urban cities is {mean_urban_drivers:.2f}.")

median_urban_drivers = np.median(urban_drivers)
print(f"The median number of drivers for urban cities is {median_urban_drivers:.2f}.")

mode_urban_drivers = sts.mode(urban_drivers)
print(f"The mode for drivers for urban cities is {mode_urban_drivers}.")

The mean number of drivers for urban cities is 36.68.
The median number of drivers for urban cities is 37.00.
The mode for drivers for urban cities is ModeResult(mode=array([39], dtype=int64), count=array([86])).


In [42]:
# Calculate the measures of central tendency for the number of drivers for the suburban cities.
mean_suburban_drivers = np.mean(suburban_drivers)
print(f"The mean number of drivers for suburban cities is {mean_suburban_drivers:.2f}.")

median_suburban_drivers = np.median(suburban_drivers)
print(f"The median number of drivers for suburban cities is {median_suburban_drivers:.2f}.")

mode_suburban_drivers = sts.mode(suburban_drivers)
print(f"The mode for drivers for suburban cities is {mode_suburban_drivers}.")

The mean number of drivers for suburban cities is 13.71.
The median number of drivers for suburban cities is 16.00.
The mode for drivers for suburban cities is ModeResult(mode=array([20], dtype=int64), count=array([79])).


## Here are the calculations for the summary data frame

In [43]:
# Get the sum of the fares for each city type--this is correct.
sum_fares_by_type = pyber_data_df.groupby(["type"]).sum()["fare"]
sum_fares_by_type

type
Rural        4327.93
Suburban    19356.33
Urban       39854.38
Name: fare, dtype: float64

In [44]:
# Get the sum for all of the fares--this is correct.
total_fares = pyber_data_df["fare"].sum()
total_fares

63538.64

In [45]:
#Calculate the percentage of fare for each city type.
#type_percents = 100*sum_fares_by_type/ total_fares
#type_percents

In [46]:
# More advanced coding method combining the two variables into one line of code
#type_percents = 100*pyber_data_df.groupby(["type"]).sum()["fare"] /pyber_data_df["fare"].sum()
#type_percents

## Keep this section

In [47]:
#grouping by total number of rides--this is correct
total_rides_per_city_type = pyber_data_df.groupby(["type"]).count()["ride_id"]
total_rides_per_city_type

type
Rural        125
Suburban     625
Urban       1625
Name: ride_id, dtype: int64

In [48]:
pyber_data_df["ride_id"].count()

2375

In [49]:
#grouping by total drivers
#pyber_data_df.groupby(["type"]).count()["driver_count"]

In [50]:
#Recreate the city type data frames
urban_cities_df = pyber_data_df[pyber_data_df["type"] == "Urban"]
suburban_cities_df = pyber_data_df[pyber_data_df["type"] == "Suburban"]
rural_cities_df = pyber_data_df[pyber_data_df["type"] == "Rural"]

In [51]:
rural_total_rides = rural_cities_df["ride_id"].count()
suburban_total_rides = suburban_cities_df["ride_id"].count()
urban_total_rides = urban_cities_df["ride_id"].count()

In [52]:
#create a per city type series
#per_city_types = city_data_df.set_index(["city"])["type"]
#per_city_types.tail(10)

## Here is my solution to getting the total drivers column for the dataframe

In [53]:
#total_rural_drivers = rural_cities_df.groupby(["city"])["driver_count"].max()

In [54]:
total_urban_drivers = sum(urban_cities_df.groupby(["city"])["driver_count"].max())

In [55]:
total_urban_drivers

2405

In [56]:
total_suburban_drivers = sum(suburban_cities_df.groupby(["city"])["driver_count"].max())

In [57]:
total_suburban_drivers

490

In [58]:
total_rural_drivers = sum(rural_cities_df.groupby(["city"])["driver_count"].max())

In [59]:
total_rural_drivers

78

In [60]:
# Get the sum for all of the fares--this is correct.
total_fares = pyber_data_df["fare"].sum()
total_fares

63538.64

In [61]:
# Get the sum for urban fares.
total_urban_fares = urban_cities_df["fare"].sum()
total_urban_fares

39854.380000000005

In [62]:
# Get the sum for rural fares.
total_rural_fares = rural_cities_df["fare"].sum()
total_rural_fares

4327.929999999999

In [63]:
# Get the sum for rural fares.
total_suburban_fares = suburban_cities_df["fare"].sum()
total_suburban_fares

19356.33

In [64]:
# Get the mean for suburban fares.
mean_suburban_fares = suburban_cities_df["fare"].mean()
mean_suburban_fares

30.97012800000002

In [65]:
# Get the mean for urban fares.
mean_urban_fares = urban_cities_df["fare"].mean()
mean_urban_fares

24.52577230769236

In [66]:
# Get the sum for rural fares.
mean_rural_fares = rural_cities_df["fare"].mean()
mean_rural_fares

34.623440000000016

In [67]:
#get the average fare per driver rural
avg_fare_driver_rural = total_rural_fares /total_rural_drivers
avg_fare_driver_rural

55.486282051282046

In [68]:
#get the average fare per driver suburban
avg_fare_driver_suburban = total_suburban_fares /total_suburban_drivers
avg_fare_driver_suburban

39.50271428571429

In [69]:
#get the average fare per driver urban
avg_fare_driver_urban = total_urban_fares /total_urban_drivers
avg_fare_driver_urban

16.571467775467777

In [70]:
#total_drivers_sum = sum(pyber_data_df.groupby(["type","city"])["driver_count"].max())

In [71]:
#pyber_table_index = ["Rural", "Suburban","Urban"]

In [72]:
pyber_table_index_r =["Rural"]

In [73]:
pyber_table_index_u =["Urban"]

In [74]:
pyber_table_index_s =["Suburban"]

In [75]:
rural_df = pd.DataFrame({
    "City Type": pyber_table_index_r,
    "Total Rides": rural_total_rides,
    "Total Drivers":total_rural_drivers,
    "Total Fares": total_rural_fares,
    "Average Fare per Ride":mean_rural_fares,
    "Average Fare per Driver":avg_fare_driver_rural
    
})

In [76]:
rural_df

Unnamed: 0,City Type,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
0,Rural,125,78,4327.93,34.62344,55.486282


In [77]:
urban_df = pd.DataFrame({
    "City Type": pyber_table_index_u,
    "Total Rides": urban_total_rides,
    "Total Drivers":total_urban_drivers,
    "Total Fares": total_urban_fares,
    "Average Fare per Ride":mean_urban_fares,
    "Average Fare per Driver":avg_fare_driver_urban
    
})

In [78]:
urban_df

Unnamed: 0,City Type,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
0,Urban,1625,2405,39854.38,24.525772,16.571468


In [79]:
suburban_df = pd.DataFrame({
    "City Type": pyber_table_index_s,
    "Total Rides": suburban_total_rides,
    "Total Drivers":total_suburban_drivers,
    "Total Fares": total_suburban_fares,
    "Average Fare per Ride":mean_suburban_fares,
    "Average Fare per Driver":avg_fare_driver_suburban
    
})

In [80]:
suburban_df

Unnamed: 0,City Type,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
0,Suburban,625,490,19356.33,30.970128,39.502714


In [81]:
append1_df = rural_df.append(suburban_df, ignore_index = True)

In [82]:
append1_df

Unnamed: 0,City Type,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
0,Rural,125,78,4327.93,34.62344,55.486282
1,Suburban,625,490,19356.33,30.970128,39.502714


In [83]:
pyBer_Summary_df = append1_df.append(urban_df, ignore_index = True)


In [84]:
pyBer_Summary_df.set_index('City Type', inplace=True)
pyBer_Summary_df

Unnamed: 0_level_0,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
City Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Rural,125,78,4327.93,34.62344,55.486282
Suburban,625,490,19356.33,30.970128,39.502714
Urban,1625,2405,39854.38,24.525772,16.571468


In [85]:
pyBer_Summary_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Rural to Urban
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Total Rides              3 non-null      int32  
 1   Total Drivers            3 non-null      int64  
 2   Total Fares              3 non-null      float64
 3   Average Fare per Ride    3 non-null      float64
 4   Average Fare per Driver  3 non-null      float64
dtypes: float64(3), int32(1), int64(1)
memory usage: 132.0+ bytes


In [86]:
#Format the values of the PyBer Data Frame
pyBer_Summary_df["Total Rides"]= pyBer_Summary_df["Total Rides"].map("{:,}".format)
pyBer_Summary_df["Total Drivers"]= pyBer_Summary_df["Total Drivers"].map("{:,}".format)
pyBer_Summary_df["Total Fares"]= pyBer_Summary_df["Total Fares"].map("${:,.2f}".format)
pyBer_Summary_df["Average Fare per Ride"] = pyBer_Summary_df["Average Fare per Ride"].map("${:,.2f}".format)
pyBer_Summary_df["Average Fare per Driver"] = pyBer_Summary_df["Average Fare per Driver"].map("${:,.2f}".format)

In [87]:
pyBer_Summary_df

Unnamed: 0_level_0,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
City Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Rural,125,78,"$4,327.93",$34.62,$55.49
Suburban,625,490,"$19,356.33",$30.97,$39.50
Urban,1625,2405,"$39,854.38",$24.53,$16.57
