# Bikeshare 

In [56]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import inspect
import seaborn as sns 
import matplotlib.pyplot as plt

In [57]:
day = pd.read_csv('Bike-Sharing-Dataset/day.csv') #index_coulmns 0
hour = pd.read_csv('Bike-Sharing-Dataset/hour.csv')

### Functions might be used:

In [58]:
def retrieve_name(var):
        """
        Gets the name of var. Does it from the out most frame inner-wards.
        :param var: variable to get name from.
        :return: string
        """
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

In [59]:
def missing(df):

    total = df.isnull().sum().sort_values(ascending=False)
    percent_1 = df.isnull().sum()/df.isnull().count()*100
    percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
    missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
    return missing_data.head(5)

In [60]:
def iter_columns(df):
    for col_name in df.columns: 
        print(col_name)
        

In [61]:
def get_cname(df,col):
    return df.columns[df.columns.get_loc(col)]  # <--- Is ther a nicer way to do this?



In [62]:
def explore(*args):
    for arg in args:
        print('Data exploration for',retrieve_name(arg),':','\n')
        print(arg.info(),'\n')
        #print('Describe the data:\n',arg.describe().round(2),'\n')
        for col in arg:
            if len(arg[col].unique()) < 50:
                print('List unique values of column',print(col),':\n',arg[col].unique()) #get_cname(arg,col)
        print('\n','Missing Values:\n',missing(arg),'\n\n\n') 
        print('Column names for',retrieve_name(arg),':\n')
        for col_name in arg.columns: 
            print(col_name, end='   ')
        print('\n')

### Targets

- Improve availability of bikes for working professionals / salaried employees as they are the most valued customers.
- Improve availability of bikes for casual customers.
- Provide statistics about contribution of weather in bike demands.
- Provide statistics about how traffic and pollution affect sales.



### Questions

- Is there a correlation between temp and feeled temp?
- Is there a correlation between temp, atemp and the toal bike rentals?
- Is there a correlation between season and feeled temp / temp?

## Data Exploration

In [63]:
explore(day, hour)

Data exploration for day : 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB
None 

season
List unique values of column None :
 [1 2 3 4]
yr


## Preprocessing

### Categorize Data

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv


#### Original columns:

- instant: record index
- dteday : date
- season : season (1:winter, 2:spring, 3:summer, 4:fall)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
 weathersit:
  - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
  - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

#### New columns:

- day: add hours
- raw_tem
- raw_atemp
- raw_hum

### Data Transformatin / Categorization 

In [64]:
# Calculating the raw weather values
def raw_temp(temp):
    """
    Reverse normalized temperature (temp):
    tMin = -8
    tMax =  +39
    normTemp = (tempRaw-tMin)/(tMax-tMin)
    tempRaw = tMax*normTemp-tMin*normTemp+tMin
    returns tempRaw
    """    
    return 39*temp-(-8)*temp+(-8)

def raw_atemp(atemp):
    """
    Reverse normalized feeling temperature (atemp):
    tMin = -16
    tMax =  +50
    normTemp = (atempRaw-tMin)/(tMax-tMin)
    atempRaw = tMax*normTemp-tMin*normTemp+tMin
    returns atempRaw
    """
    return 50*atemp-(-8)*atemp+(-8)

def raw_hum(hum):
    return hum*100



In [65]:
# Categorizing numeric values and further transformation. 

data = [day,hour] # <-- Ask about how data is saved to a variable. 

season_map = {1:'Winter', 2:'Spring', 3:'Summer', 4:'Fall'}
holiday_map = {0:'No', 1:'Yes'}
yr_map = {0:'2011', 1:'2012'}
workingday_map = {0:'No', 1:'Yes'}
weekday_map = {6:'Saturday', 0:'Sunday', 1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday'}
mnth_map = {1:'January', 2:'February', 3:'March', 4:'April', 5:'May', 6:'June', 7:'July', 8:'August', 9:'September', 10:'October', 11:'November', 12:'December'}
weathersit_map = {1:'nice', 2:'cloudy', 3:'wet', 4:'lousy'}

# All mapings in this for loop will be applied to day and hour. 
for df in data:
    df['season'] = df['season'].map(season_map)
    df['holiday'] = df['holiday'].map(holiday_map)
    df['yr'] = df['yr'].map(yr_map)
    df['workingday'] = df['workingday'].map(workingday_map)
    df['weekday'] = df['weekday'].map(weekday_map)
    df['mnth'] = df['mnth'].map(mnth_map)
    df['weathersit'] = df['weathersit'].map(weathersit_map)
    df['dteday'] = pd.to_datetime(df['dteday'], format='%Y-%m-%d')# <-- How to change the datetime format to '%d.%m.%Y' | This was changing the dtype --> .dt.strftime('%m/%d/%Y')


In [66]:
for df in data:    
    df['raw_temp'] = raw_temp(df['temp'])
    df['raw_atemp'] = raw_atemp(df['atemp'])
    df['raw_hum'] = raw_hum(df['hum'])


In [67]:
# Converting temperature in categories


labels=['Death','Frigid','Freezing','Very Cold','Cold','Cool','Mild','Warm','Hot','Very Hot','Extremly Hot']
bins= [-100,-15,-10,-3,3,12,17,25,33,40,45,100]

for df in data:
    df['cat_temp'] = pd.cut(df['temp'], bins, labels=labels)
    df['cat_atemp'] = pd.cut(df['atemp'], bins, labels=labels)

In [68]:
day

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,...,hum,windspeed,casual,registered,cnt,raw_temp,raw_atemp,raw_hum,cat_temp,cat_atemp
0,1,2011-01-01,Winter,2011,January,No,Saturday,No,cloudy,0.344167,...,0.805833,0.160446,331,654,985,8.175849,13.090250,80.5833,Very Cold,Very Cold
1,2,2011-01-02,Winter,2011,January,No,Sunday,No,cloudy,0.363478,...,0.696087,0.248539,131,670,801,9.083466,12.516862,69.6087,Very Cold,Very Cold
2,3,2011-01-03,Winter,2011,January,No,Monday,Yes,nice,0.196364,...,0.437273,0.248309,120,1229,1349,1.229108,2.985490,43.7273,Very Cold,Very Cold
3,4,2011-01-04,Winter,2011,January,No,Tuesday,Yes,nice,0.200000,...,0.590435,0.160296,108,1454,1562,1.400000,4.303076,59.0435,Very Cold,Very Cold
4,5,2011-01-05,Winter,2011,January,No,Wednesday,Yes,nice,0.226957,...,0.436957,0.186900,82,1518,1600,2.666979,5.297660,43.6957,Very Cold,Very Cold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,Winter,2012,December,No,Thursday,Yes,cloudy,0.254167,...,0.652917,0.350133,247,1867,2114,3.945849,5.145236,65.2917,Very Cold,Very Cold
727,728,2012-12-28,Winter,2012,December,No,Friday,Yes,cloudy,0.253333,...,0.590000,0.155471,644,2451,3095,3.906651,6.792668,59.0000,Very Cold,Very Cold
728,729,2012-12-29,Winter,2012,December,No,Saturday,No,cloudy,0.253333,...,0.752917,0.124383,159,1182,1341,3.906651,6.059200,75.2917,Very Cold,Very Cold
729,730,2012-12-30,Winter,2012,December,No,Sunday,No,nice,0.255833,...,0.483333,0.350754,364,1432,1796,4.024151,5.438600,48.3333,Very Cold,Very Cold


# Questions:

How do variables get written <->