In [1]:
# Import necessary libraries

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

## Data Loading and Preparation

### Loading Data

In [4]:
# Load the first monthly dataset (April 2014).
# The path now contains the same user directory as the May-September loads in
# the next cell; the original pointed at /Users/Documents/..., which does not
# match them.
# NOTE(review): confirm the April file really sits next to the other months.
df1 = pd.read_csv("/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-apr14.csv")

In [None]:
# Load the remaining monthly files (May through September), one frame per month

df2, df3, df4, df5, df6 = map(
    pd.read_csv,
    (
        "/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-may14.csv",
        "/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-jun14.csv",
        "/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-jul14.csv",
        "/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-aug14.csv",
        "/Users/karthik/Documents/Python_Project/UDEMY-UBER/Apr-Sep_Uber-data/uber-raw-data-sep14.csv",
    ),
)

In [None]:
# Combine the six monthly frames into one table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# monthly frame keeps its own 0-based labels, so the combined frame carries
# duplicate index labels that label-aligned operations can trip over.
df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

In [None]:
## Other quick summaries that can be run here:

# df.info() || df.shape ||  df.describe() ||  df.dtypes

# Preview the first five rows of the combined data
df.head(5)

###  Data Preparation

##### Lat : The latitude of the Uber pickup

##### Lon : The longitude of the Uber pickup

##### Base : The TLC base company code affiliated with the Uber pickup

##### The globe is divided by imaginary lines into 360 degrees of longitude and 180 degrees of latitude. Lines of longitude run from top to bottom (north to south) and measure how far east or west a place is; lines of latitude run from side to side (west to east) and measure how far north or south it is.


##### Latitude is the measurement of distance north or south of the Equator.

##### Every location on earth has a global address. Because the address is in numbers, people can communicate about location no matter what language they might speak. A global address is given as two numbers called coordinates. The two numbers are a location's latitude number and its longitude number ("Lat/Long").

In [None]:
# For further analysis, split the 'Date/Time' column into its components:
# day, month, hour, minute and weekday name.

# The CSVs were read without date parsing, so 'Date/Time' is still text here
# and the .dt accessor below would raise AttributeError. Convert it first.
# Re-running this cell is harmless: already-parsed values pass through unchanged.
df['Date/Time'] = pd.to_datetime(df['Date/Time'])

df['day'] = df['Date/Time'].dt.day
df['month'] = df['Date/Time'].dt.month
df['hour'] = df['Date/Time'].dt.hour
df['minute'] = df['Date/Time'].dt.minute
df['weekday'] = df['Date/Time'].dt.day_name()

In [None]:
# Display the frame to check the time columns added in the previous cell
df

#### Analysis of journey by Week-days

In [None]:
# Need to install and import plotly for graphs

!pip install plotly
import plotly.express as px

In [None]:
# Number of pickups recorded for each weekday name
df['weekday'].value_counts()

In [None]:
# Plot the number of rides for each weekday

# Count once and reuse the result for both axes
rides_per_weekday = df['weekday'].value_counts()
px.bar(x=rides_per_weekday.index,
       y=rides_per_weekday)

#### Analysis of journey by Hour

In [None]:
# Rides per hour of day, one bar per hour (0-23).
# The default of 10 bins spreads the 24 integer hours over unequal groups of
# two or three hours each, which distorts the shape of the distribution.
plt.hist(df['hour'], bins=24, range=(-0.5, 23.5))

#### Ride volume peaks in the evening, when people are leaving work

#### Analysis of journey by Month

In [None]:
# List the distinct month numbers present in the data
for month in df['month'].unique():
    print(month)

In [None]:

# Hourly ride distribution, one panel per month, on a single large figure
plt.figure(figsize=(40,20))
for i,month in enumerate(df['month'].unique()):
    plt.subplot(3,2,i+1)   # 3x2 grid: room for the six months that were loaded
    # One bin per hour (0-23) so that no bar merges several hours
    df[df['month']==month]['hour'].hist(bins=24, range=(-0.5, 23.5))
    # Without a title the panels cannot be told apart
    plt.title('month {}'.format(month))


#### Analysis of Rush of each hour in each month

In [None]:
# One small figure per month showing rides by hour of day
for month in df['month'].unique():
    plt.figure(figsize=(5,3))
    # One bin per hour (0-23); the default 10 bins would merge unequal groups of hours
    df[df['month']==month]['hour'].hist(bins=24, range=(-0.5, 23.5))
    # Label each figure so it is clear which month it shows
    plt.title('month {}'.format(month))
    

#### Analysis of Journey of Each  Day

In [None]:
# Rides per day of the month, one bar per calendar day.
plt.figure(figsize=(10,6))
# 31 bins centred on the integers 1..31. The original range stopped at 30.5,
# so pickups on the 31st were left out of the histogram.
plt.hist(df['day'], bins=31, rwidth=.8, range=(0.5, 31.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')

In [None]:
# Cross-check the day-of-month distribution using seaborn
sns.displot(data=df['day'])

#### Analysis of Total rides month wise

In [None]:
# Rides per day of the month, one panel per month
plt.figure(figsize=(20,8))
for i,month in enumerate(df['month'].unique(),1):
    plt.subplot(3,2,i)
    df_out=df[df['month']==month]
    # One bin per calendar day; the default 10 bins lump several days together
    plt.hist(df_out['day'], bins=31, range=(0.5, 31.5))
    # The original format string had no placeholder, so the label never showed
    # which month the panel belongs to
    plt.xlabel('days in month {}'.format(month))
    plt.ylabel('total rides')
    
## Rush by the hour for every month
## Rides per hour, one panel per month

plt.figure(figsize=(20,8))

for i,month in enumerate(df['month'].unique(),1):
    plt.subplot(3,2,i)
    df_out = df[df['month']== month]
    # One bin per hour (0-23); the default 10 bins merge unequal groups of hours
    plt.hist(df_out['hour'], bins=24, range=(-0.5, 23.5))
    # Label each panel with the month it shows
    plt.title('month {}'.format(month))

### getting Rush in hour 

In [None]:
# A pointplot reads much like a line graph.
# Here: pickup latitude ('Lat') against hour of the day
sns.pointplot(data=df, x='hour', y='Lat')

In [None]:
# Pickup latitude ('Lat') by hour, with a separate line for each day of the week

fig, ax = plt.subplots(figsize=(20, 10))
sns.pointplot(data=df, x='hour', y='Lat', hue='weekday', ax=ax)

In [None]:


# Total rides per month.
# The counts are computed once and passed as plain vectors. The original also
# passed data=df, which has a different length from these vectors and is not
# used by them; newer seaborn versions reject that combination.
rides_per_month = df['month'].value_counts()
sns.pointplot(x=rides_per_month.index, y=rides_per_month.values)

## Grouping 'Base' number by month

In [None]:
# Count pickups for every (Base, month) pair, then turn the group keys back into columns
rides_by_base_month = df.groupby(['Base', 'month'])['Date/Time'].count()
base = rides_by_base_month.reset_index()
base

#### Analyse which base is the most popular in each month

In [None]:
# Pickups per month, one line per base.
plt.figure(figsize = (10,5))
# The grouped frame built in the previous cell is named `base`; the original
# referenced `df_base`, which is never defined and raised NameError.
sns.pointplot(y= 'Date/Time',
             x= 'month',
             data = base,
             hue = 'Base')

## Heatmap of hour,weekday ; Hour,day ; month,day ; month,weekday

#### create pivot_tables

##### The simplest way to create a pivot table is to first call groupby on two columns to get the groups, for example
##### df.groupby(['weekday','hour']).apply(lambda x: len(x)),
##### and then call unstack, so that "weekday" becomes the rows and "hour" becomes the columns.

In [None]:
# Heatmap of hour vs. weekday: start from just those two columns

df_weekday = df.loc[:, ['hour', 'weekday']]

In [None]:
# Rides per (weekday, hour): counts the non-null 'Date/Time' entries in each group
weekday_hour_groups = df.groupby(['weekday', 'hour'])
df_weekday = weekday_hour_groups['Date/Time'].count()
df_weekday

In [None]:
# Pivot the innermost index level into columns to get a 2-D table
df_weekday.unstack(level=-1)

##### Create a heatmap so that the table can be visualized easily

In [None]:
# Heatmap of ride counts: weekday rows by hour columns.
plt.figure(figsize=(20,5))
# groupby sorts the weekday names alphabetically. Put the rows back into
# calendar order so that neighbouring rows are neighbouring days.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.heatmap(df_weekday.unstack().reindex(weekday_order))

In [None]:
# Heatmap of month vs. weekday: number of rides for every (month, weekday) pair
month_weekday_groups = df.groupby(['month', 'weekday'])
df_month_weekday = month_weekday_groups['weekday'].count()
df_month_weekday

In [None]:
# Create a function to generate heatmaps

def heatmap(x, y, data=None):
    """Draw a heatmap of row counts for every combination of two columns.

    Parameters
    ----------
    x : str
        Column whose values become the heatmap rows.
    y : str
        Column whose values become the heatmap columns.
    data : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level ``df``, which is
        what the original two-argument form always used.

    Returns
    -------
    The axes object returned by ``sns.heatmap``.
    """
    if data is None:
        data = df
    # Count rows per (x, y) pair, then pivot y into columns to get a 2-D grid
    counts = data.groupby([x, y])[x].count().unstack()
    plt.figure(figsize=(20, 5))
    return sns.heatmap(counts)

In [None]:
## Cross-check the earlier analysis with a heatmap: hour of day against day of month
heatmap(x='hour', y='day')

In [None]:
# Day of month against month
heatmap(x='day', y='month')

#### Analysing the results
#### We observe that the number of trips increases each month, which suggests that Uber's ridership grew continuously from April to September 2014.

In [None]:
# Scatter every pickup by longitude/latitude inside a fixed viewing window
plt.figure(figsize=(15, 10))
plt.xlim(-74.2, -73.7)
plt.ylim(40.60, 41)
plt.plot(df['Lon'], df['Lat'], 'm+', markersize=.5)

#### Analysis of Location data points

In [None]:
# Same pickup map, restricted to rides whose weekday is Sunday
df_new = df.loc[df['weekday'] == 'Sunday']
plt.figure(figsize=(15, 10))
plt.xlim(-74.2, -73.7)
plt.ylim(40.60, 41)
plt.plot(df_new['Lon'], df_new['Lat'], 'm+', markersize=.5)

##### We can see a number of hot spots here. Midtown Manhattan is clearly a huge bright spot,
##### and the dense area extends from Midtown down to Lower Manhattan,
##### followed by Upper Manhattan and Brooklyn Heights.


## Analysis of Jan-June uber_15

In [None]:
# Load the January-June 2015 pickup records and display them
# NOTE(review): absolute, user-specific path; it will not resolve on another machine
df_add = pd.read_csv('/Users/karthik/Downloads/uber-raw-data-janjune-15.csv')
df_add

In [None]:
# Parse the pickup timestamps so the .dt accessor can be used on them in the next cell
df_add['Pickup_date'] = pd.to_datetime(df_add['Pickup_date'])

In [None]:
# Break the pickup timestamp into separate time and calendar columns
pickup_dt = df_add['Pickup_date'].dt
df_add['minute'] = pickup_dt.minute
df_add['hour'] = pickup_dt.hour
df_add['day'] = pickup_dt.day
df_add['month'] = pickup_dt.month
df_add['weekday'] = pickup_dt.day_name()

In [None]:
# Pickups per month in the 2015 data.
# px.bar builds its own plotly figure, so the matplotlib figure the original
# opened first only produced an empty extra plot. The counts are computed once.
month_counts_15 = df_add['month'].value_counts()
px.bar(x=month_counts_15.index,
        y=month_counts_15)

In [None]:
# Number of 2015 pickups for each hour of the day
plt.figure(figsize=(15,6))
# Pass the column as x= explicitly: seaborn's first positional parameter is
# `data`, so a bare Series there is not reliably treated as the x variable.
sns.countplot(x=df_add['hour'])

In [None]:
# Load the per-base activity file and display it
# NOTE(review): absolute, user-specific path; it will not resolve on another machine
df_new = pd.read_csv('/Users/karthik/Downloads/Uber-Jan-Feb-FOIL.csv')
df_new

In [None]:
# Total active vehicles and trips per dispatching base.
# Only the two numeric measures used by the cells below are summed; a bare
# .sum() would also try to aggregate the text 'date' column.
c = (
    df_new.groupby('dispatching_base_number')[['active_vehicles', 'trips']]
    .sum()
    .reset_index()
)

In [None]:
# Bar chart of the summed trips for each dispatching base
px.bar(x=c['dispatching_base_number'],y=c['trips'])

In [None]:
# Line plot of the summed active vehicles for each dispatching base
plt.plot(c['dispatching_base_number'],c['active_vehicles'])

In [None]:
# Same per-base active-vehicle totals, drawn as a seaborn pointplot
sns.pointplot(data=c, x='dispatching_base_number', y='active_vehicles')

In [None]:
# Derive the month number from the 'date' column and show the updated frame
parsed_dates = pd.to_datetime(df_new['date'])
df_new['month'] = parsed_dates.dt.month
df_new

In [None]:
# Total active vehicles and trips per (base, month).
# Only the two numeric measures plotted below are summed. A bare .sum() would
# also aggregate the text 'date' column, and any derived ratio column added
# later if this cell is re-run.
d = (
    df_new.groupby(['dispatching_base_number', 'month'])[['active_vehicles', 'trips']]
    .sum()
    .reset_index()
)
d

In [None]:
# Active vehicles per base, one line per month
sns.pointplot(data=d, x='dispatching_base_number', y='active_vehicles', hue='month')

In [None]:
# Trips per base, one line per month
sns.pointplot(data=d, x='dispatching_base_number', y='trips', hue='month')

In [None]:
# Spread of the active-vehicle values for each base
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(data=df_new, x='dispatching_base_number', y='active_vehicles', ax=ax)

In [None]:
# Trips per active vehicle for every row, then show the updated frame
df_new['trip/veh'] = df_new['trips'].div(df_new['active_vehicles'])
df_new

In [None]:
# Average every remaining column per (base, date) pair
# NOTE(review): .mean() runs over all non-key columns; presumably they are all numeric at this point, confirm
e = df_new.groupby(['dispatching_base_number','date']).mean()

In [None]:
# Turn the group keys back into ordinary columns
e = e.reset_index()

In [None]:
# Trips per vehicle over time, one line per dispatching base.
plt.figure(figsize=(20,8))
# 'date' comes from the CSV as text, so without an explicit order the x axis
# is not guaranteed to follow the calendar. Pass the dates sorted chronologically.
date_order = sorted(e['date'].unique(), key=pd.to_datetime)
sns.pointplot(x='date',y='trip/veh',data=e, hue ='dispatching_base_number', order=date_order)

##### Uber pickups by the month in NYC

In [None]:
# Uber pickups per month in the 2015 data.
# The original read from `uber_15`, which is never defined in this notebook
# and raised NameError; the 2015 pickups were loaded as `df_add`.
pickups_by_month_15 = df_add['month'].value_counts()
px.bar(x=pickups_by_month_15.index,
           y=pickups_by_month_15.values)

#### We can see that the number of Uber pickups increased steadily throughout the first half of 2015 in NYC