# Working with latitudes and longitudes

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

Geo-based features are a class of features present in a wide range of datasets. These features record the geographical location of a place or point in space. Features such as longitude, latitude, and address are geo-features that need to be engineered before they are useful to a model.

In [None]:
stations = pd.read_csv("data/stations_clean.csv")
stations.head()

In [None]:
station2537 = stations.iloc[0]
station2572 = stations.iloc[1]

## Manhattan distance

The Manhattan distance is the sum of the horizontal and vertical distance between two points. Let’s demonstrate this below using the dataset:

In [None]:
def manhattan_distance(lat1, lng1, lat2, lng2):
    """Return the Manhattan (L1) distance between two coordinate pairs.

    The result is the absolute latitude difference plus the absolute
    longitude difference, expressed in the same units as the inputs.
    Scalars and NumPy/pandas array-likes are both accepted because the
    computation goes through ``np.abs``.
    """
    return np.abs(lat2 - lat1) + np.abs(lng1 - lng2)

In [None]:
stations

In [None]:
manhattan_distance(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

In [None]:
# datasist MODULE
from datasist.feature_engineering import manhattan_distance

manhattan_distance(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

## Haversine distance

The Haversine distance is the great-circle distance between two points on a sphere, given their longitudes and latitudes. It’s very important in navigation.

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are given in degrees and converted to radians
    internally. The distance is returned in kilometres, using a mean
    Earth radius of 6371 km. Scalars and array-likes are both accepted.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    half_dlat = (phi2 - phi1) * 0.5
    half_dlng = (lam2 - lam1) * 0.5
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2)
    hav = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlng) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

In [None]:
haversine_array(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

In [None]:
# datasist MODULE
from datasist.feature_engineering import haversine_distance

haversine_distance(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

## Bearing

The bearing is the compass direction used to travel from a starting point, and must be within the range 0 to 360.

In [None]:
def bearing_array(lat1, lng1, lat2, lng2):
    """Return the initial compass bearing from point 1 to point 2.

    All four arguments are given in degrees. Scalars and array-likes are
    both accepted.

    Returns:
        The bearing in degrees, normalised to the compass range 0-360
        (north = 0, east = 90, south = 180, west = 270).
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2_rad)
    x = (np.cos(lat1_rad) * np.sin(lat2_rad)
         - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(lng_delta_rad))
    # arctan2 yields values in [-180, 180]; shift the negative half so that
    # westward bearings come out as 180-360 instead of negative angles.
    bearing_deg = np.degrees(np.arctan2(y, x))
    return (bearing_deg + 360) % 360

In [None]:
bearing_array(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

In [None]:
# datasist MODULE
from datasist.feature_engineering import bearing

bearing(station2537['latitude'], station2537['longitude'], station2572['latitude'], station2572['longitude'])

## Example: Bike Sharing Demand dataset

Goal: Predict daily ridership totals.

In [None]:
# read the trips dataset
trips = pd.read_csv("data/bike_trips_clean.csv")
trips.dropna(inplace=True)
trips.head()

In [None]:
# read the stations dataset
stations = pd.read_csv("data/stations_clean.csv")
stations.head()

- Check and fix the data types

In [None]:
trips["start_time"] = pd.to_datetime(trips["start_time"])
trips["trip_id"] = trips["trip_id"].astype("int")
trips["end_station_id"] = trips["end_station_id"].astype("int")
trips["start_station_id"] = trips["start_station_id"].astype("int")

In [None]:
trips.info()

In [None]:
trips.isnull().sum()

- Build the time-based features

In [None]:
def get_time_features(trips):
    """Derive calendar features from the ``start_time`` column.

    Args:
        trips: DataFrame with a datetime-typed ``start_time`` column.

    Returns:
        A new DataFrame (the input is left untouched) with these columns
        added: ``date``, ``time``, ``year``, ``month``, ``day``, ``dow``
        (0 = Monday), ``dow_name``, ``hour``, ``weekday`` (1 for
        Monday-Friday, else 0) and ``workhour`` (1 when the hour is between
        8 and 17 inclusive, else 0).
    """
    ser = trips["start_time"]
    dow = ser.dt.dayofweek
    hour = ser.dt.hour
    # assign() builds a new frame, so re-running the calling cell does not
    # depend on (or damage) the state of the caller's original frame.
    return trips.assign(
        date=ser.dt.date,
        time=ser.dt.time,
        year=ser.dt.year,
        month=ser.dt.month,
        day=ser.dt.day,
        dow=dow,
        dow_name=ser.dt.day_name(),
        hour=hour,
        weekday=(dow < 5).astype(int),
        # A range check, not membership in the two-element list [8, 17].
        workhour=hour.between(8, 17).astype(int),
    )

In [None]:
trips_full = get_time_features(trips)
trips_full.drop(columns= ["start_time"], inplace=True)

In [None]:
trips_full.head()

To get a quick understanding of the periodic patterns of the data, let us have a look at the average demand per hour during a week.

In [None]:
trips_full.groupby('dow_name').dow_name.count().plot(kind="bar")
plt.show()

In [None]:
trips_full.groupby(["dow_name", "hour"]).dow_name.count()["Friday"].plot()
plt.show()

Ridership totals during different months

In [None]:
# Count rides per calendar month (all years pooled together).
tripsByMonth = trips_full.groupby('month').month.count()
# Label only the months that actually occur in the data; assigning a fixed
# 12-element list would raise a length mismatch if any month were missing.
month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
tripsByMonth.index = [month_abbr[m - 1] for m in tripsByMonth.index]

# After reset_index() the labels live in the 'index' column and the counts
# in the 'month' column (the name of the counted series).
ax = sns.barplot(x='index', y='month', data=tripsByMonth.reset_index(), color='red')
ax.figure.set_size_inches(7,4)
# NOTE(review): set_style is called after the axes exist, so it presumably
# only affects figures created later - confirm whether that is intended.
sns.set_style(style='white')
ax.axes.set_title('Total Rides in Each Month', fontsize=24)
ax.set_xlabel('Month', size=20)
ax.set_ylabel('Rides', size=20)
ax.tick_params(labelsize=16)
plt.show()

In [None]:
trips_by_year_month = trips_full
trips_by_year_month = trips_by_year_month.groupby(['month','year']).month.count()
trips_by_year_month

In [None]:
tripsFullYears = trips_full[trips_full['year'].isin([2014,2015])]
tripsFullYears = tripsFullYears.groupby(['month', 'year'])["trip_id"].count()
tripsByMonth = pd.DataFrame(tripsFullYears)
ax = sns.barplot(x='month', y="trip_id", hue='year', data=tripsByMonth.reset_index(), color='red')
ax.figure.set_size_inches(10,5)
sns.set_style(style='white')
ax.axes.set_title('Total Rides in Each Month', fontsize=24)
ax.set_xlabel('Month', size=20)
ax.set_ylabel('Rides', size=20)
ax.tick_params(labelsize=16)
plt.show()

---

Can we predict daily ridership totals based on the weather?

In [None]:
weather = pd.read_csv("data/austin_weather.csv", na_values="-")
weather.head()

In [None]:
weather["Events"].unique()

In [None]:
# Re-read the raw weather file; "-" marks a missing measurement.
weather = pd.read_csv("data/austin_weather.csv", na_values="-")

weather_new = pd.DataFrame()
weather_new["date"] = pd.to_datetime(weather["Date"])
# One 0/1 flag per event type mentioned in the free-text Events column.
# na=False matters: without it a missing Events value yields NaN, which
# np.where treats as truthy and would flag the day as having every event.
weather_new['rain'] = np.where(weather['Events'].str.contains('Rain', na=False), 1, 0)
weather_new['thunderstorm'] = np.where(weather['Events'].str.contains('Thunderstorm', na=False), 1, 0)
weather_new['fog'] = np.where(weather['Events'].str.contains('Fog', na=False), 1, 0)
weather_new['snow'] = np.where(weather['Events'].str.contains('Snow', na=False), 1, 0)
weather_new['temp_avg'] = weather['TempAvgF'].astype("int")
weather_new['humidity_avg'] = weather['HumidityAvgPercent'].astype("float")
weather_new['wind_avg'] = weather['WindAvgMPH'].astype("float")
weather_new['wind_gust'] = weather['WindGustMPH'].astype("float")
# Convert traces of rain ('T') to .001 inches to record that there was
# precipitation, but less than could be measured.
weather_new['precipitation_inches'] = np.where(weather['PrecipitationSumInches'] == 'T', 0.001, weather['PrecipitationSumInches'])
weather_new['precipitation_inches'] = weather_new['precipitation_inches'].astype("float")
# Keep only the two full calendar years used for modelling.
weather_new = weather_new[(weather_new['date'] >= '2014-01-01') & (weather_new['date'] <= '2015-12-31')]
weather_new = weather_new.set_index('date', drop=True)
# Fill any remaining gaps with the column mean.
weather_new = weather_new.fillna(weather_new.mean(numeric_only=True))

In [None]:
weather_new.head(10)

In [None]:
weather_new.isnull().sum()

Next, let's group the ridership data by date and create our target variable.

In [None]:
trips_full.head()

In [None]:
trips = trips_full[trips_full['year'].isin([2014,2015])]
trips = trips.groupby(['date'])["trip_id"].count().to_frame()
trips.columns = ['trip_count']
trips.head()

In [None]:
def get_time_features(trips):
    """Add calendar features derived from the frame's date index.

    Note: this definition shadows the earlier ``get_time_features`` that
    works on a ``start_time`` column.

    Args:
        trips: DataFrame whose index holds dates (anything that
            ``pd.to_datetime`` can parse).

    Returns:
        A new DataFrame (the input is left untouched) with ``year``,
        ``month``, ``day``, ``dow`` (0 = Monday) and ``weekday`` (1 for
        Monday-Friday, else 0) columns added.
    """
    dates = pd.to_datetime(trips.index)
    # Work on a copy so the caller's frame is not modified as a side effect.
    features = trips.copy()
    features["year"] = dates.year
    features["month"] = dates.month
    features["day"] = dates.day
    features["dow"] = dates.dayofweek
    features["weekday"] = (features["dow"] < 5).astype(int)
    return features

In [None]:
trips = get_time_features(trips).copy()

In [None]:
trips.head()

In [None]:
# Inner join ridership and weather data together
trips_with_weather = pd.merge(right=trips, left=weather_new, how='inner', left_index=True, right_index=True)

In [None]:
trips_with_weather

In [None]:
from sklearn.metrics import mean_squared_error

def get_scores(trained_model, X_train, X_test, y_train, y_test):
    """Print the test-set RMSE and the train/test scores of a fitted model.

    Args:
        trained_model: fitted estimator exposing ``predict`` and ``score``.
        X_train, y_train: training features and target (used for the
            training score only).
        X_test, y_test: held-out features and target.

    Prints the root-mean-squared error of the test predictions, that error
    as a percentage of the mean prediction, and the values returned by
    ``trained_model.score`` on the training and test sets.
    """
    pred = trained_model.predict(X_test)
    # Square root of the MSE, i.e. the RMSE, in the units of the target.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print(f'RMSE: {rmse:.5} ({rmse/np.mean(pred)*100:3.3}% of the mean prediction)')
    print(f"Training set score: {trained_model.score(X_train, y_train):.2f}")
    print(f"Test set score: {trained_model.score(X_test, y_test):.2f}")

Prepare data for learning

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( 
    trips_with_weather.drop(columns=["trip_count"]), trips_with_weather["trip_count"], test_size = .3, random_state = 13, shuffle=True)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha = 0.5)
reg.fit(X_train, y_train)

In [None]:
get_scores(reg, X_train, X_test, y_train, y_test)

Wow, this did terribly. Let's plot the observed values against the predictions to see how bad it looks.

In [None]:
def plot_prediction_vs_actual(trained_model, X_train, X_test, y_train, y_test):
    """Draw a regression scatter plot of test predictions against actuals.

    ``trained_model`` must expose ``predict``. Only ``X_test`` and
    ``y_test`` are used; the training arguments are accepted so the
    signature matches ``get_scores``.
    """
    predicted = trained_model.predict(X_test)
    ax = sns.regplot(x=y_test, y=predicted)
    ax.figure.set_size_inches(10, 6)
    ax.set_title('Predictions Vs. Actual', fontsize=24)
    for set_label, text in ((ax.set_xlabel, 'Actual'), (ax.set_ylabel, 'Predictions')):
        set_label(text, fontsize=20)
    ax.tick_params(labelsize=16)
    plt.show()

In [None]:
plot_prediction_vs_actual(reg, X_train, X_test, y_train, y_test)

Let's try some more feature engineering.

In [None]:
from sklearn.linear_model import Lasso

reg = Lasso(alpha=0.1)
reg.fit(X_train, y_train)

get_scores(reg, X_train, X_test, y_train, y_test)
plot_prediction_vs_actual(reg, X_train, X_test, y_train, y_test)

Let's take a look at which dates are the outliers. The core of the data appears to follow the trend line y = x. However, a significant number of days are extreme outliers, with total rides > 1500.

In [None]:
outlierTrips = trips_with_weather[trips_with_weather["trip_count"] > 1500]
outlierTrips

In [None]:
from datetime import datetime

def spring_break_woo(date):
    """Return 1 if ``date`` falls inside a hard-coded spring-break window.

    The windows (both ends inclusive) are 2015-03-14 to 2015-03-23 and
    2014-03-08 to 2014-03-17. Any other date returns 0. ``date`` must be
    comparable with ``datetime`` objects.
    """
    windows = (
        (datetime(2015, 3, 14), datetime(2015, 3, 23)),
        (datetime(2014, 3, 8), datetime(2014, 3, 17)),
    )
    for first_day, last_day in windows:
        if date >= first_day and date <= last_day:
            return 1
    return 0

In [None]:
trips_with_weather['spring_break'] = trips_with_weather.apply(lambda row: spring_break_woo(row.name), axis=1)
trips_with_weather.head()

Let's quickly check whether this made any improvement.

In [None]:
def run_model(model, data):
    """Fit ``model`` on a train split of ``data`` and report how it does.

    Args:
        model: unfitted estimator exposing ``fit``, ``predict`` and ``score``.
        data: DataFrame holding the features plus a ``trip_count`` target
            column.

    The data is split 70/30 with a fixed random seed, the model is fitted on
    the training part, and the scores and a prediction-vs-actual plot are
    produced for the test part.
    """
    # Split the frame that was passed in, not the notebook-level global.
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["trip_count"]),
                                                        data["trip_count"],
                                                        test_size = .3,
                                                        random_state = 13, shuffle=True)

    model.fit(X_train, y_train)

    get_scores(model, X_train, X_test, y_train, y_test)
    plot_prediction_vs_actual(model, X_train, X_test, y_train, y_test)

In [None]:
run_model(Lasso(alpha=0.1), trips_with_weather)

In [None]:
from sklearn.tree import DecisionTreeRegressor

run_model(DecisionTreeRegressor(max_depth=15), trips_with_weather)

Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
def run_model(model, data):
    """Fit ``model`` on one-hot encoded calendar features and report scores.

    Note: this definition shadows the earlier ``run_model`` that fits on the
    raw columns.

    Args:
        model: unfitted estimator exposing ``fit``, ``predict`` and ``score``.
        data: DataFrame holding the features plus a ``trip_count`` target
            column; must contain the ``year``, ``month``, ``day``, ``dow``
            and ``weekday`` columns.

    The data is split 70/30 with a fixed random seed. The calendar columns
    are one-hot encoded (first level dropped) and all other columns are
    passed through unchanged. The encoder is fitted on the training part
    only. Scores and a prediction-vs-actual plot are produced for the test
    part.
    """
    # Split the frame that was passed in, not the notebook-level global.
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["trip_count"]),
                                                        data["trip_count"],
                                                        test_size = .3,
                                                        random_state = 13, shuffle=True)

    vars_categorical = ["year", "month", "day", "dow", "weekday"]

    # Wrap the one-hot encoder in a pipeline
    one_hot_encoder = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder(categories='auto', drop='first', sparse_output=False)),
    ])

    # Apply the encoder to the categorical columns; keep the rest as they are
    preprocessor = ColumnTransformer(transformers=[
        ('cat_encoder', one_hot_encoder, vars_categorical)
        ], remainder='passthrough')

    # Fit the preprocessor on the training data only, then transform both splits
    preprocessor.fit(X_train)
    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)

    model.fit(X_train, y_train)

    get_scores(model, X_train, X_test, y_train, y_test)
    plot_prediction_vs_actual(model, X_train, X_test, y_train, y_test)
    
run_model(Lasso(alpha=0.1), trips_with_weather)