# Bikeshare 

In [12]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import inspect
import seaborn as sns 
import matplotlib.pyplot as plt

In [13]:
day = pd.read_csv('Bike-Sharing-Dataset/day.csv', index_col=0) 
hour = pd.read_csv('Bike-Sharing-Dataset/hour.csv', index_col=0)

In [14]:
day.head()

Unnamed: 0_level_0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


### Functions might be used:

In [15]:
def retrieve_name(var):
        """
        Gets the name of var. Does it from the out most frame inner-wards.
        :param var: variable to get name from.
        :return: string
        """
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

In [16]:
def missing(df):

    total = df.isnull().sum().sort_values(ascending=False)
    percent_1 = df.isnull().sum()/df.isnull().count()*100
    percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
    missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
    return missing_data.head(5)

In [17]:
def iter_columns(df):
    for col_name in df.columns: 
        print(col_name)
        

In [18]:
def get_cname(df,col):
    return df.columns[df.columns.get_loc(col)]  # <--- Is ther a nicer way to do this?



In [19]:
def explore(*args):
    for arg in args:
        print('Data exploration for',retrieve_name(arg),':','\n')
        print(arg.info(),'\n')
        #print('Describe the data:\n',arg.describe().round(2),'\n')
        for col in arg:
            if len(arg[col].unique()) < 50:
                print('List unique values of column',print(col),':\n',arg[col].unique()) #get_cname(arg,col)
        print('\n','Missing Values:\n',missing(arg),'\n\n\n') 
        print('Column names for',retrieve_name(arg),':\n')
        for col_name in arg.columns: 
            print(col_name, end='   ')
        print('\n')

### Targets

- Improve availability of bikes for working professionals / salaried employees as they are the most valued customers.
- Improve availability of bikes for casual customers.
- Provide statistics about contribution of weather in bike demands.
- Provide statistics about how traffic and pollution affect sales.



### Questions

- Is there a correlation between temp and feeled temp?
- Is there a correlation between temp, atemp and the toal bike rentals?
- Is there a correlation between season and feeled temp / temp?

## Data Exploration

In [20]:
explore(day, hour)

Data exploration for day : 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 731 entries, 1 to 731
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      731 non-null    object 
 1   season      731 non-null    int64  
 2   yr          731 non-null    int64  
 3   mnth        731 non-null    int64  
 4   holiday     731 non-null    int64  
 5   weekday     731 non-null    int64  
 6   workingday  731 non-null    int64  
 7   weathersit  731 non-null    int64  
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  casual      731 non-null    int64  
 13  registered  731 non-null    int64  
 14  cnt         731 non-null    int64  
dtypes: float64(4), int64(10), object(1)
memory usage: 91.4+ KB
None 

season
List unique values of column None :
 [1 2 3 4]
yr
List unique values of column None :
 [0 1

## Preprocessing

### Categorize Data

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv


#### Original columns:

- instant: record index
- dteday : date
- season : season (1:winter, 2:spring, 3:summer, 4:fall)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
 weathersit:
  - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
  - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

#### New columns:

- raw_tem
- raw_atemp
- raw_hum

### Data Transformation / Categorization 

In [21]:
# Categorizing numeric values and further transformation. 
# Adding ne columns. 

data = [day,hour] # <-- Ask about how data is saved to a variable. 

season_map = {1:'Winter', 2:'Spring', 3:'Summer', 4:'Fall'}
holiday_map = {0:'No', 1:'Yes'}
yr_map = {0:'2011', 1:'2012'}
workingday_map = {0:'No', 1:'Yes'}
weekday_map = {6:'Saturday', 0:'Sunday', 1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday'}
mnth_map = {1:'January', 2:'February', 3:'March', 4:'April', 5:'May', 6:'June', 7:'July', 8:'August', 9:'September', 10:'October', 11:'November', 12:'December'}
weathersit_map = {1:'nice', 2:'cloudy', 3:'wet', 4:'lousy'}

temp_categories = ['Death','Frigid','Freezing','Very Cold','Cold','Cool','Mild','Warm','Hot','Very Hot','Extremly Hot']
temp_categorie_values = [-100,-15,-10,-3,3,12,17,25,33,40,45,100] # <-- Can i set the outer limits to infinite? 


##y = 'hello'  deep copy and shallow copy 

# All mapings in this for loop will be applied to day and hour. 
for df in data:
    df['season'] = df['season'].map(season_map)
    df['holiday'] = df['holiday'].map(holiday_map)
    df['yr'] = df['yr'].map(yr_map)
    df['workingday'] = df['workingday'].map(workingday_map)
    df['weekday'] = df['weekday'].map(weekday_map)
    df['mnth'] = df['mnth'].map(mnth_map)
    df['weathersit'] = df['weathersit'].map(weathersit_map)
    df['dteday'] = pd.to_datetime(df['dteday'], format='%Y-%m-%d')# <-- How to change the datetime format to '%d.%m.%Y' | This was changing the dtype --> .dt.strftime('%m/%d/%Y')

    df['raw_temp'] = 39*df['temp']-(-8)*df['temp']+(-8)
    df['raw_atemp'] = 39*df['temp']-(-16)*df['temp']+(-16)
    df['raw_hum'] = df['hum']*100
    df['raw_windspeed'] = df['windspeed']*67

    df['cat_temp'] = pd.cut(df['raw_temp'], temp_categorie_values, labels=temp_categories)
    df['cat_atemp'] = pd.cut(df['raw_atemp'], temp_categorie_values, labels=temp_categories)




In [22]:
day.head(100)

Unnamed: 0_level_0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,...,windspeed,casual,registered,cnt,raw_temp,raw_atemp,raw_hum,raw_windspeed,cat_temp,cat_atemp
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2011-01-01,Winter,2011,January,No,Saturday,No,cloudy,0.344167,0.363625,...,0.160446,331,654,985,8.175849,2.929185,80.5833,10.749882,Cold,Very Cold
2,2011-01-02,Winter,2011,January,No,Sunday,No,cloudy,0.363478,0.353739,...,0.248539,131,670,801,9.083466,3.991290,69.6087,16.652113,Cold,Cold
3,2011-01-03,Winter,2011,January,No,Monday,Yes,nice,0.196364,0.189405,...,0.248309,120,1229,1349,1.229108,-5.199980,43.7273,16.636703,Very Cold,Freezing
4,2011-01-04,Winter,2011,January,No,Tuesday,Yes,nice,0.200000,0.212122,...,0.160296,108,1454,1562,1.400000,-5.000000,59.0435,10.739832,Very Cold,Freezing
5,2011-01-05,Winter,2011,January,No,Wednesday,Yes,nice,0.226957,0.229270,...,0.186900,82,1518,1600,2.666979,-3.517365,43.6957,12.522300,Very Cold,Freezing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,2011-04-06,Spring,2011,April,No,Wednesday,Yes,nice,0.390833,0.387608,...,0.263063,413,2395,2808,10.369151,5.495815,47.0833,17.625221,Cold,Cold
97,2011-04-07,Spring,2011,April,No,Thursday,Yes,nice,0.437500,0.433696,...,0.162312,571,2570,3141,12.562500,8.062500,60.2917,10.874904,Cool,Cold
98,2011-04-08,Spring,2011,April,No,Friday,Yes,cloudy,0.335833,0.324479,...,0.226992,172,1299,1471,7.784151,2.470815,83.6250,15.208464,Cold,Very Cold
99,2011-04-09,Spring,2011,April,No,Saturday,No,cloudy,0.342500,0.341529,...,0.133083,879,1576,2455,8.097500,2.837500,87.7500,8.916561,Cold,Very Cold


# Questions:

How do variables get written <->