# Exploratory Data Analysis: NYC Taxi Tipping Behavior

This notebook explores other factors that may influence tip amount, identifies unnecessary columns, and investigates evidence of seasonal tipping behavior.

In [None]:
import pandas as pd

# Load the cleaned trip records from the CSV next to the notebook.
df = pd.read_csv("taxi_cleaned.csv")

# Parse pickup timestamps so the .dt accessor can be used below.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

# Month number -> season label. An explicit lookup replaces the nested
# conditional lambda and is applied vectorized via .map. A row whose month is
# missing now gets a missing season instead of silently being labelled 'Fall'.
month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}

# Add derived time columns.
df['month'] = df['tpep_pickup_datetime'].dt.month
df['hour'] = df['tpep_pickup_datetime'].dt.hour
df['season'] = df['month'].map(month_to_season)

# Keep only trips with a positive fare, because tip_percent divides by
# fare_amount. The .copy() makes the filtered frame independent of the
# original, so the column assignment below does not write to a slice
# (which would raise SettingWithCopyWarning).
df = df[df['fare_amount'] > 0].copy()

# Tip expressed as a percentage of the fare.
df['tip_percent'] = (df['tip_amount'] / df['fare_amount']) * 100
df.head()

### Correlation with Tip Amount and Tip Percentage

In [None]:
# Pairwise correlation of the numeric columns, computed once and reused for
# both targets below (the matrix is identical for both lookups).
corr_matrix = df.corr(numeric_only=True)

# Correlation of every numeric column with tip_amount, strongest positive first.
corr_tip_amount = corr_matrix['tip_amount'].sort_values(ascending=False)
print("Correlation with tip_amount:\n", corr_tip_amount)

# Correlation of every numeric column with tip_percent, strongest positive first.
corr_tip_percent = corr_matrix['tip_percent'].sort_values(ascending=False)
print("\nCorrelation with tip_percent:\n", corr_tip_percent)

### Columns Likely Not Needed

In [None]:
# Columns flagged as candidates for removal in the seasonal tip analysis.
likely_unneeded_columns = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'extra',
    'mta_tax',
    'improvement_surcharge',
    'congestion_surcharge',
    'tpep_dropoff_datetime',
    'DOLocationID',
]

# Header line followed by the list on its own line.
print(
    "Columns that can likely be dropped for seasonal tip analysis:",
    likely_unneeded_columns,
    sep="\n",
)

### Average Tip % by Season

In [None]:
# Mean tip percentage per season, flattened to a two-column frame
# (season, tip_percent).
tip_percent_by_season = df.groupby('season')['tip_percent']
seasonal_tip_percent = tip_percent_by_season.mean().reset_index()

# Display the seasons ordered from highest to lowest average tip percentage.
seasonal_tip_percent.sort_values('tip_percent', ascending=False)