In [116]:
# Add Matplotlib inline magic command
%matplotlib notebook
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import os

In [117]:
# Files to load
city_data_to_load = "Resources/city_data.csv"
ride_data_to_load = "Resources/ride_data.csv"

In [118]:
# Read the city data file and store it in a pandas DataFrame.
city_data_df = pd.read_csv(city_data_to_load)
city_data_df.head(10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [119]:
city_data_df.isnull().count()

city            120
driver_count    120
type            120
dtype: int64

In [120]:
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [121]:
sum(city_data_df["type"] == "Suburban")

36

In [122]:
# Get the columns and the rows that are not null.
ride_data_df = pd.read_csv(ride_data_to_load)
ride_data_df.isnull().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [123]:
# Get the columns and the rows that are not null.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [124]:
pyber_data_df = pd.merge(ride_data_df, city_data_df, on=["city","city"])
pyber_data_df.head()
merged_df = pd.merge(city_data_df, ride_data_df, on=["city","city"])
merged_df.count()
count_by_city_type = pyber_data_df.groupby(merged_df["type"]).sum()["driver_count"]
count_by_city_type
ride_data_df.groupby(["city"]).count()["ride_id"]

city
Amandaburgh         18
Barajasview         22
Barronchester       16
Bethanyland         18
Bradshawfurt        10
                    ..
West Robert         31
West Samuelburgh    25
Williamsonville     14
Williamsstad        23
Williamsview        20
Name: ride_id, Length: 120, dtype: int64

In [125]:
urban_average = pyber_data_df.loc[pyber_data_df["type"] == "Urban"]["fare"].mean()
#urban_average = urban_df["fare"].mean()
urban_average
urban_count = pyber_data_df.loc[pyber_data_df["type"] == "Urban"]["ride_id"].count()
urban_count
suburban_average = pyber_data_df.loc[pyber_data_df["type"] == "Suburban"]["fare"].mean()
#urban_average = urban_df["fare"].mean()
suburban_average
suburban_count = pyber_data_df.loc[pyber_data_df["type"] == "Suburban"]["ride_id"].count()
suburban_count
rural_average = pyber_data_df.loc[pyber_data_df["type"] == "Rural"]["fare"].mean()
#urban_average = urban_df["fare"].mean()
rural_average
rural_count = pyber_data_df.loc[pyber_data_df["type"] == "Rural"]["ride_id"].count()
rural_count

125

In [126]:
y_axis = [urban_average, rural_average, suburban_average]
y_axis

[24.525772307692307, 34.62344, 30.970127999999992]

In [127]:
x_axis = [urban_count, rural_count, suburban_count]
x_axis

[1625, 125, 625]

In [128]:
# Create the Urban city DataFrame.
urban_cities_df = pyber_data_df[pyber_data_df["type"] == "Urban"]
urban_cities_df.head()
# Create the Suburban and Rural city DataFrames.
suburban_cities_df = pyber_data_df[pyber_data_df["type"] == "Suburban"]
rural_cities_df = pyber_data_df[pyber_data_df["type"] == "Rural"]


In [129]:
# Get the number of rides for urban cities.
urban_ride_count = urban_cities_df.groupby(["city"]).count()["ride_id"]
urban_ride_count.head()

city
Amandaburgh        18
Barajasview        22
Carriemouth        27
Christopherfurt    27
Deanville          19
Name: ride_id, dtype: int64

In [130]:
suburban_ride_count = suburban_cities_df.groupby(["city"]).count()["ride_id"]
rural_ride_count = rural_cities_df.groupby(["city"]).count()["ride_id"]

In [131]:
# Get average fare for each city in the urban cities.
urban_avg_fare = urban_cities_df.groupby(["city"]).mean()["fare"]
urban_avg_fare.head()

city
Amandaburgh        24.641667
Barajasview        25.332273
Carriemouth        28.314444
Christopherfurt    24.501852
Deanville          25.842632
Name: fare, dtype: float64

In [132]:
# Get average fare for each city in the suburban and rural cities.
suburban_avg_fare = suburban_cities_df.groupby(["city"]).mean()["fare"]
rural_avg_fare = rural_cities_df.groupby(["city"]).mean()["fare"]

In [133]:
# Get average number of frivers for each city in the suburban and rural cities.
suburban_driver_count = suburban_cities_df.groupby(["city"]).mean()["driver_count"]
rural_driver_count = rural_cities_df.groupby(["city"]).mean()["driver_count"]
# Get the average number of drivers for each urban city.
urban_driver_count = urban_cities_df.groupby(["city"]).mean()["driver_count"]
urban_driver_count.head()

city
Amandaburgh        12.0
Barajasview        26.0
Carriemouth        52.0
Christopherfurt    41.0
Deanville          49.0
Name: driver_count, dtype: float64

In [134]:
plt.scatter(urban_ride_count, urban_avg_fare, s=10*urban_driver_count, 
            edgecolor = "black", label = "Urban", color="coral", alpha=0.8, linewidths=1)
plt.xlabel("Total Number of Rides(Per City)")
plt.ylabel("Average Fare($)")
plt.title("PyBer Ride-Sharing Data(2019)")
plt.grid(True)
plt.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x1ece04097c8>

In [135]:
# Build the scatter plots for suburban cities.
plt.scatter(suburban_ride_count,
      suburban_avg_fare,
      s=10*suburban_driver_count, c="skyblue",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Suburban")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare ($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
# Add the legend.
plt.legend()

<matplotlib.legend.Legend at 0x1ece04d3688>

In [136]:
# Build the scatter plots for rural cities.
plt.scatter(rural_ride_count,
      rural_avg_fare,
      s=10*rural_driver_count, c="gold",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Rural")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare ($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
# Add the legend.
plt.legend()

<matplotlib.legend.Legend at 0x1ece0484308>

In [137]:
# Build the scatter charts for each city type.
plt.subplots(figsize=(10, 6))
plt.scatter(urban_ride_count,
      urban_avg_fare,
      s=10*urban_driver_count, c="coral",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Urban")

plt.scatter(suburban_ride_count,
      suburban_avg_fare,
      s=10*suburban_driver_count, c="skyblue",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Suburban")

plt.scatter(rural_ride_count,
      rural_avg_fare,
      s=10*rural_driver_count, c="gold",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Rural")

# Incorporate the other graph properties
plt.title("PyBer Ride-Sharing Data (2019)", fontsize=20)
plt.ylabel("Average Fare ($)", fontsize=12)
plt.xlabel("Total Number of Rides (Per City)", fontsize=12)
plt.grid(True)

# Add the legend.
# Create a legend
lgnd = plt.legend(fontsize="12", mode="Expanded",
         scatterpoints=1, loc="best", title="City Types")
lgnd.legendHandles[0]._sizes = [75]
lgnd.legendHandles[1]._sizes = [75]
lgnd.legendHandles[2]._sizes = [75]
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size.
plt.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize="12")

# Save the figure.
plt.savefig("analysis/Fig1.png")

# Show the plot
plt.show()

<IPython.core.display.Javascript object>

In [138]:
# Get summary statistics.
urban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,1625.0,1625.0,1625.0
mean,24.525772,4873485000000.0,36.678154
std,11.738649,2907440000000.0,20.075545
min,4.05,14588100000.0,3.0
25%,14.55,2400244000000.0,22.0
50%,24.64,4711188000000.0,37.0
75%,34.58,7451579000000.0,52.0
max,44.97,9991538000000.0,73.0


In [139]:
suburban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,625.0,625.0,625.0
mean,30.970128,4971015000000.0,13.712
std,10.677508,2912410000000.0,8.042372
min,12.05,321833700.0,1.0
25%,21.97,2364253000000.0,5.0
50%,30.75,5053221000000.0,16.0
75%,39.83,7551674000000.0,21.0
max,49.96,9917744000000.0,25.0


In [140]:
rural_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,125.0,125.0,125.0
mean,34.62344,4647746000000.0,4.296
std,14.558046,2887834000000.0,2.691156
min,10.11,26848730000.0,1.0
25%,19.9,2275364000000.0,1.0
50%,37.05,4023962000000.0,4.0
75%,47.07,7118047000000.0,7.0
max,58.55,9990581000000.0,9.0


In [141]:
# Get summary statistics.
urban_ride_count.describe()

count    66.000000
mean     24.621212
std       5.408726
min      12.000000
25%      21.000000
50%      24.000000
75%      28.000000
max      39.000000
Name: ride_id, dtype: float64

In [142]:
rural_ride_count.describe()

count    18.000000
mean      6.944444
std       2.508157
min       3.000000
25%       5.250000
50%       6.000000
75%       8.750000
max      12.000000
Name: ride_id, dtype: float64

In [143]:
suburban_ride_count.describe()

count    36.000000
mean     17.361111
std       4.323707
min       9.000000
25%      14.000000
50%      17.000000
75%      19.250000
max      27.000000
Name: ride_id, dtype: float64

In [144]:
# Calculate the mean of the ride count for each city type.
round(urban_ride_count.mean(),2), round(suburban_ride_count.mean(),2), round(rural_ride_count.mean(),2)

(24.62, 17.36, 6.94)

In [145]:
round(urban_ride_count.median(),2), round(suburban_ride_count.median(),2), round(rural_ride_count.median(),2)

(24.0, 17.0, 6.0)

In [146]:
urban_ride_count.mode()

0    22
1    25
dtype: int64

In [147]:
rural_ride_count.mode()

0    6
dtype: int64

In [148]:
suburban_ride_count.mode()

0    17
dtype: int64

In [149]:
# Import NumPy and the stats module from SciPy.
import numpy as np
import scipy.stats as sts

In [152]:
# Calculate the measures of central tendency for the ride count for the urban cities.
mean_urban_ride_count = np.mean(rural_ride_count)
print(f"The mean for the ride counts for urban trips is {mean_urban_ride_count:.2f}.")

median_urban_ride_count = np.median(rural_ride_count)
print(f"The median for the ride counts for urban trips is {median_urban_ride_count}.")

mode_urban_ride_count = sts.mode(rural_ride_count)
print(f"The mode for the ride counts for urban trips is {mode_urban_ride_count}.")

The mean for the ride counts for urban trips is 6.94.
The median for the ride counts for urban trips is 6.0.
The mode for the ride counts for urban trips is ModeResult(mode=array([6], dtype=int64), count=array([5])).


In [162]:
# Get the fares for the urban cities.
urban_fares = urban_cities_df["fare"]
rural_fares = rural_cities_df["fare"]
suburban_fares = suburban_cities_df["fare"]
urban_fares.head()

0    13.83
1    31.25
2    19.89
3    24.28
4    13.89
Name: fare, dtype: float64

In [163]:
urban_fares.mean()
urban_fares.median()
urban_fares.mode()
sts.mode(urban_fares)

ModeResult(mode=array([22.86]), count=array([5]))

In [165]:
# Calculate the measures of central tendency for the average fare for the urban cities.
mean_urban_fares = np.mean(suburban_fares)
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")

median_urban_fares = np.median(urban_fares)
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")

mode_urban_fares = sts.mode(urban_fares)
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

The mean fare price for urban trips is $30.97.
The median fare price for urban trips is $24.64.
The mode fare price for urban trips is ModeResult(mode=array([22.86]), count=array([5])).


In [173]:
# Get the driver count data from the urban cities.
urban_drivers = urban_cities_df['driver_count']

suburban_drivers = suburban_cities_df['driver_count']

rural_drivers = rural_cities_df['driver_count']


In [168]:
rural_driver_count = rural_cities_df["driver_count"].sum()
rural_driver_count 

537

In [169]:
suburban_driver_count = suburban_cities_df["driver_count"].sum()
suburban_driver_count 

8570

In [None]:
driver_count_df = 