## 1. Import Libraries and Parse Web Data

Data is parsed from web html and turned into a pandas data frame.

In [44]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly_express as px
import scipy.stats as st

In [None]:
# Request URL and turn html into BeautifulSoup object
url = 'https://practicum-content.s3.us-west-1.amazonaws.com/data-analyst-eng/moved_chicago_weather_2017.html'

# A timeout stops the cell from hanging indefinitely if the server never answers.
req = requests.get(url, timeout=30)

# Fail right here on a 4xx/5xx response instead of silently parsing an error
# page and only noticing later when the table cannot be found.
req.raise_for_status()

html = BeautifulSoup(req.text, 'lxml')

In [9]:
# Extract the html for the table
table = html.find('table', attrs={'id':'weather_records'})

# find() returns None when no element matches; stop here with a clear message
# rather than hitting an AttributeError in a later cell that calls table.find_all().
if table is None:
    raise ValueError("No <table id='weather_records'> element found in the downloaded page")

<table border="1" class="dataframe" id="weather_records">
<thead>
<tr style="text-align: right;">
<th>Date and time</th>
<th>Temperature</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>2017-11-01 00:00:00</td>
<td>276.150</td>
<td>broken clouds</td>
</tr>
<tr>
<td>2017-11-01 01:00:00</td>
<td>275.700</td>
<td>scattered clouds</td>
</tr>
<tr>
<td>2017-11-01 02:00:00</td>
<td>275.610</td>
<td>overcast clouds</td>
</tr>
<tr>
<td>2017-11-01 03:00:00</td>
<td>275.350</td>
<td>broken clouds</td>
</tr>
<tr>
<td>2017-11-01 04:00:00</td>
<td>275.240</td>
<td>broken clouds</td>
</tr>
<tr>
<td>2017-11-01 05:00:00</td>
<td>275.050</td>
<td>overcast clouds</td>
</tr>
<tr>
<td>2017-11-01 06:00:00</td>
<td>275.140</td>
<td>overcast clouds</td>
</tr>
<tr>
<td>2017-11-01 07:00:00</td>
<td>275.230</td>
<td>overcast clouds</td>
</tr>
<tr>
<td>2017-11-01 08:00:00</td>
<td>275.230</td>
<td>overcast clouds</td>
</tr>
<tr>
<td>2017-11-01 09:00:00</td>
<td>275.320</td>
<td>overcast clouds</td>
</tr>

In [14]:
# Extract variable names from the table html:
# the text of every <th> cell, in document order
variable_names = [header_cell.text for header_cell in table.find_all('th')]

print(variable_names)

['Date and time', 'Temperature', 'Description']


In [17]:
# Extract data from the table html
# Keep only rows without <th> cells (i.e. skip the header row), then turn each
# remaining row into a list holding the text of its <td> cells.
body_rows = (tr for tr in table.find_all('tr') if not tr.find_all('th'))
data = [[cell.text for cell in tr.find_all('td')] for tr in body_rows]

[['2017-11-01 00:00:00', '276.150', 'broken clouds'], ['2017-11-01 01:00:00', '275.700', 'scattered clouds'], ['2017-11-01 02:00:00', '275.610', 'overcast clouds'], ['2017-11-01 03:00:00', '275.350', 'broken clouds'], ['2017-11-01 04:00:00', '275.240', 'broken clouds'], ['2017-11-01 05:00:00', '275.050', 'overcast clouds'], ['2017-11-01 06:00:00', '275.140', 'overcast clouds'], ['2017-11-01 07:00:00', '275.230', 'overcast clouds'], ['2017-11-01 08:00:00', '275.230', 'overcast clouds'], ['2017-11-01 09:00:00', '275.320', 'overcast clouds'], ['2017-11-01 10:00:00', '275.230', 'overcast clouds'], ['2017-11-01 11:00:00', '275.510', 'overcast clouds'], ['2017-11-01 12:00:00', '275.690', 'overcast clouds'], ['2017-11-01 13:00:00', '275.790', 'overcast clouds'], ['2017-11-01 14:00:00', '275.850', 'broken clouds'], ['2017-11-01 15:00:00', '276.600', 'broken clouds'], ['2017-11-01 16:00:00', '277.540', 'sky is clear'], ['2017-11-01 17:00:00', '278.320', 'overcast clouds'], ['2017-11-01 18:00:00

In [18]:
# Use parsed web data to create Pandas data frame:
# one row per scraped table row, columns labelled with the scraped headers
weather_records = pd.DataFrame(
    data=data,
    columns=variable_names,
)

print(weather_records)

           Date and time Temperature       Description
0    2017-11-01 00:00:00     276.150     broken clouds
1    2017-11-01 01:00:00     275.700  scattered clouds
2    2017-11-01 02:00:00     275.610   overcast clouds
3    2017-11-01 03:00:00     275.350     broken clouds
4    2017-11-01 04:00:00     275.240     broken clouds
..                   ...         ...               ...
692  2017-11-29 20:00:00     281.340        few clouds
693  2017-11-29 21:00:00     281.690      sky is clear
694  2017-11-29 22:00:00     281.070        few clouds
695  2017-11-29 23:00:00     280.060      sky is clear
696  2017-11-30 00:00:00     278.460      sky is clear

[697 rows x 3 columns]


In [26]:
# Read in data from files
# Directory holding the three exported CSVs, kept in a single place so the
# notebook can be pointed at another copy of the data by editing one line.
# NOTE(review): this path has no leading '/', so it is resolved relative to the
# kernel's working directory — confirm that is intended.
DATA_DIR = 'Users/kellyshreeve/Desktop/Data-Sets'

company = pd.read_csv(f'{DATA_DIR}/moved_project_sql_result_01.csv')
neighborhood = pd.read_csv(f'{DATA_DIR}/moved_project_sql_result_04.csv')
weather = pd.read_csv(f'{DATA_DIR}/moved_project_sql_result_07.csv')

In [27]:
# Print info on each csv
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" after each report.
for frame in (company, neighborhood, weather):
    frame.info()
    print()  # blank line between reports

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   company_name  64 non-null     object
 1   trips_amount  64 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   dropoff_location_name  94 non-null     object 
 1   average_trips          94 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   start_ts            1068 non-null   object 
 1   weather_conditions  1068 non-null   object 
 2   duration_seconds    1068 non-null   float64
dtypes: float64(1), object(2)
memory usage: 25.2+ KB


None

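All data types are suitable for this analysis, although `start_ts` is stored as a string (object) and would need converting with `pd.to_datetime` for any date-based work. There are no missing values.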

In [28]:
# Print sample of rows for each data frame (first 10 rows of each, in load order)
for frame in (company, neighborhood, weather):
    display(frame.head(10))

Unnamed: 0,company_name,trips_amount
0,Flash Cab,19558
1,Taxi Affiliation Services,11422
2,Medallion Leasin,10367
3,Yellow Cab,9888
4,Taxi Affiliation Service Yellow,9299
5,Chicago Carriage Cab Corp,9181
6,City Service,8448
7,Sun Taxi,7701
8,Star North Management LLC,7455
9,Blue Ribbon Taxi Association Inc.,5953


Unnamed: 0,dropoff_location_name,average_trips
0,Loop,10727.466667
1,River North,9523.666667
2,Streeterville,6664.666667
3,West Loop,5163.666667
4,O'Hare,2546.9
5,Lake View,2420.966667
6,Grant Park,2068.533333
7,Museum Campus,1510.0
8,Gold Coast,1364.233333
9,Sheffield & DePaul,1259.766667


Unnamed: 0,start_ts,weather_conditions,duration_seconds
0,2017-11-25 16:00:00,Good,2410.0
1,2017-11-25 14:00:00,Good,1920.0
2,2017-11-25 12:00:00,Good,1543.0
3,2017-11-04 10:00:00,Good,2512.0
4,2017-11-11 07:00:00,Good,1440.0
5,2017-11-11 04:00:00,Good,1320.0
6,2017-11-04 16:00:00,Bad,2969.0
7,2017-11-18 11:00:00,Good,2280.0
8,2017-11-11 14:00:00,Good,2460.0
9,2017-11-11 12:00:00,Good,2040.0


## 2. Exploratory Data Analysis

### 2a. Top 10 neighborhoods for drop offs

In [43]:
# Sort neighborhoods by drop off and extract top 10
neighborhood_sorted = (
    neighborhood
    .sort_values(by=['average_trips'], ascending=False)  # busiest first
    .reset_index(drop=True)                              # renumber rows 0..n-1 after sorting
    .rename(columns={'dropoff_location_name':'dropoff_neighborhood'})
    .round(2)
)

# The first ten rows of the sorted frame are the ten busiest neighborhoods
neighborhood_top_10 = neighborhood_sorted.iloc[:10]

print('Top 10 neighborhoods for drop offs:')
display(neighborhood_top_10)

Top 10 neighborhoods for drop offs:


Unnamed: 0,dropoff_neighborhood,average_trips
0,Loop,10727.47
1,River North,9523.67
2,Streeterville,6664.67
3,West Loop,5163.67
4,O'Hare,2546.9
5,Lake View,2420.97
6,Grant Park,2068.53
7,Museum Campus,1510.0
8,Gold Coast,1364.23
9,Sheffield & DePaul,1259.77


### 2b. Number of Rides by Taxi Company

In [None]:
# Graph number of rides by taxi company
# Sort explicitly so the bars run from the busiest company to the least busy,
# regardless of the row order in the source CSV.
company_sorted = company.sort_values(by='trips_amount', ascending=False)

company_bar = px.bar(company_sorted, x='company_name', y='trips_amount',
                     title='Number of Rides by Taxi Company',
                     labels={'company_name':'Taxi Company',
                             'trips_amount':'Number of Rides'})

# Transparent backgrounds, matching the other bar chart in this notebook
company_bar.update_layout({
    'plot_bgcolor':'rgba(0, 0, 0, 0)',
    'paper_bgcolor':'rgba(0, 0, 0, 0)'
})

company_bar.show()


### 2c. Number of Drop Offs by Top 10 Neighborhoods

In [42]:
# Bar chart of average drop offs for the ten busiest neighborhoods
top_10_bar = px.bar(neighborhood_top_10, x='dropoff_neighborhood', y='average_trips',
                    title='Average Drop Offs by Top 10 Neighborhoods',
                    labels={'dropoff_neighborhood':'Drop Off Neighborhood', 
                            'average_trips':'Average Number of Trips'})

# Make the plot and paper backgrounds fully transparent
top_10_bar.update_layout({
    'plot_bgcolor':'rgba(0, 0, 0, 0)',
    'paper_bgcolor':'rgba(0, 0, 0, 0)'
})

# Fixed: the figure variable was misspelled as `ttop_10_bar`, which raised a NameError
top_10_bar.show()

The Loop neighborhood has the highest number of average drop offs at 10727.47 average trips, which is about 8.5 times as many as the tenth-highest neighborhood, Sheffield & DePaul, with 1259.77 average trips. There is a clear distinction between the four most popular neighborhoods (Loop, River North, Streeterville, and West Loop) and the rest of the neighborhoods. Each of these four neighborhoods has more than twice as many average trips as the next-leading neighborhood, O'Hare, meaning at least twice as many of our rides drop off in each of them as in any other neighborhood. For Zuber, this means that many of our customers either live in or visit the Loop, River North, Streeterville, and West Loop neighborhoods. We should delve further into the location and customer profile of these neighborhoods to understand our local pull and customer demographic.

## 3. Testing Hypotheses

This section uses a two independent samples t-test to test the hypothesis: The average duration of rides from the Loop to O'Hare International Airport changes on rainy Saturdays.

$H_0: \mu_{bad} = \mu_{good}$  
$H_a: \mu_{bad} \neq \mu_{good}$

alpha = 0.05

In [60]:
# Check for equality of variances
# Levene's test compares the spread of ride durations in bad vs. good weather;
# its null hypothesis is that the two groups have equal variances.
ALPHA = 0.05  # significance level stated in the hypothesis section

weather_bad = weather[weather['weather_conditions'] == 'Bad']['duration_seconds']
weather_good = weather[weather['weather_conditions'] == 'Good']['duration_seconds']

levene = st.levene(weather_bad, weather_good, center='mean')

# Rounded copies are for display only
W = levene[0].round(2)
p_value = levene[1].round(4)

print(f'The test statistic is: W = {W}')
print(f'The p value is: p = {p_value}')
print()

# Decide on the unrounded p value: rounding first could push a value just
# below alpha (e.g. 0.04996) up to 0.05 and flip the conclusion.
if levene[1] < ALPHA:
    print(f'The p value of {p_value} is less than alpha.') 
    print()
    print('Reject the null hypothesis. The groups have different variances.')
else:
    print(f'The p value of {p_value} is greater than alpha.') 
    print()
    print('Do not reject the null hypothesis. The groups do not have different variances.')

The test statistic is: W = 0.72
The p value is: p = 0.3969

The p value of 0.3969 is greather than alpha.

Do not reject the null hypothesis. The groups do not have different variances.


In [71]:
# Two Independent Samples t-test, equal variances
# Compares mean ride duration in bad vs. good weather, treating the two groups
# as having equal variances (equal_var=True).
ALPHA = 0.05  # significance level stated in the hypothesis section

t, p = st.ttest_ind(weather_bad, weather_good, equal_var=True)

mean_bad = weather_bad.mean()
mean_good = weather_good.mean()

print(f'Average trip duration good weather: {mean_good:.2f}')
print(f'Average trip duration bad weather: {mean_bad:.2f}')
print()

print(f'The test statistic is: t = {t:.2f}')
print(f'The p value is: p = {p:.4f}')
print()

if p < ALPHA:
    print(f'The p value of {p:.4f} is less than alpha.') 
    print()
    print('Reject the null hypothesis. The groups have different averages.')
else:
    print(f'The p value of {p:.4f} is greater than alpha.') 
    print()
    print('Do not reject the null hypothesis. The groups do not have different averages.')

Average trip duration good weather: 1999.68
Average trip duration bad weather: 2427.21

The test statistic is: t = 6.95
The p value is: p = 0.0000

The p value of 0.0000 is less than alpha.

Reject the null hypothesis. The groups have different averages.


The p value (p < 0.0001, displayed as 0.0000) is less than the stated alpha level of 0.05, so we reject the null hypothesis: the average trip duration during good weather is significantly different from the average trip duration during bad weather. The average trip duration during good weather (M = 1999.68 seconds) is less than the average trip duration during bad weather (M = 2427.21 seconds), so we have evidence that trips take less time on average when the weather is dry than when it is raining.