# import libraries

In [1]:
# standard library
import os

# data manipulation 
import numpy as np
import pandas as pd

# plotting
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

# Notebook-wide look and feel for figures and tables.
sn.set_style('whitegrid')
sn.set_context('talk')

# Custom matplotlib settings. They are applied after the seaborn calls above,
# so they overwrite whatever those calls set for the same keys.
params = {
    name: 'x-large'
    for name in (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    )
}
params['figure.figsize'] = (30, 10)
plt.rcParams.update(params)

# Let long text cells be shown up to 600 characters wide in rendered frames.
pd.options.display.max_colwidth = 600

# pandas display data frames as tables
from IPython.display import display, HTML


# load dataset

In [2]:
# Location of the trip file (MVG_Rad_Fahrten_2021.csv). Set the MVG_RAD_CSV
# environment variable to load it from somewhere else; the fallback is the
# path used on the original author's machine.
DATA_PATH = os.environ.get(
    'MVG_RAD_CSV',
    '/Users/ingluissantana/Desktop/Py/PP1_bike_sharing_MVG/MVG_Rad_Fahrten_2021.csv',
)

# The file is semicolon-separated; its first column becomes the row index.
df = pd.read_csv(DATA_PATH, delimiter=';', index_col=0)

# quality and cleaning of the data

In [3]:
# Normalise the column names: lowercase, surrounding whitespace removed and
# spaces replaced by underscores.
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_', regex=False)

# Text columns whose values get the same treatment. The timestamp columns are
# excluded by name rather than by position, so this cell produces the same
# list whether it runs before or after they are converted to datetime.
string_columns = [
    col
    for col in df.select_dtypes(include='object').columns
    if col not in ('starttime', 'endtime')
]

# Normalise the values too (only trailing whitespace is removed here).
for col in string_columns:
    df[col] = df[col].str.lower().str.rstrip().str.replace(' ', '_', regex=False)

## converting the start and end timestamps to datetime

In [4]:
# Parse both trip timestamps so they are stored as datetimes instead of text.
for ts_col in ('starttime', 'endtime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# quick view of the data

In [5]:
# First five trips, transposed so that each trip is shown as one column.
df.head(5).transpose()

Row,1,2,3,4,5
starttime,2021-01-01 01:01:00,2021-01-01 01:19:00,2021-01-01 01:48:00,2021-01-01 01:48:00,2021-01-01 03:26:00
endtime,2021-01-01 01:12:00,2021-01-01 01:59:00,2021-01-01 02:01:00,2021-01-01 02:00:00,2021-01-01 03:39:00
startlat,48.1258,48.12919,48.08189,48.08189,48.11587
startlon,11.64784,11.62583,11.63264,11.63264,11.62543
endlat,48.12948,48.14853,48.07975,48.07975,48.109
endlon,11.62539,11.53142,11.61032,11.61032,11.6524
rental_is_station,1,0,1,1,0
rental_station_name,kreillerstraße,,universitätsstraße_neubiberg,universitätsstraße_neubiberg,
return_is_station,0,0,1,1,0
return_station_name,,,fasanenpark_ost_neubiberg,fasanenpark_ost_neubiberg,


In [6]:
# Structural overview: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 619573 entries, 1 to 619573
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   starttime            619573 non-null  datetime64[ns]
 1   endtime              619573 non-null  datetime64[ns]
 2   startlat             619573 non-null  float64       
 3   startlon             619573 non-null  float64       
 4   endlat               619573 non-null  float64       
 5   endlon               619573 non-null  float64       
 6   rental_is_station    619573 non-null  int64         
 7   rental_station_name  619573 non-null  object        
 8   return_is_station    619573 non-null  int64         
 9   return_station_name  619573 non-null  object        
dtypes: datetime64[ns](2), float64(4), int64(2), object(2)
memory usage: 52.0+ MB


# unique, duplicated and null values

In [7]:
# Per column: non-null entries ('count'), total entries ('size') and distinct values ('nunique').
df.agg(['count', 'size', 'nunique'])



Unnamed: 0,starttime,endtime,startlat,startlon,endlat,endlon,rental_is_station,rental_station_name,return_is_station,return_station_name
count,619573,619573,619573,619573,619573,619573,619573,619573,619573,619573
size,619573,619573,619573,619573,619573,619573,619573,619573,619573,619573
nunique,272997,273299,14609,19791,15751,21161,2,321,2,319


In [8]:
# True for every column that contains at least one null value.
# (.all() would only flag columns in which *every* value is null, so partially
# missing data would go unnoticed.)
# NOTE(review): rows without a station name are reported as non-null by
# df.info(), so they appear to hold empty strings and will not show up here.
df.isnull().any()

starttime              False
endtime                False
startlat               False
startlon               False
endlat                 False
endlon                 False
rental_is_station      False
rental_station_name    False
return_is_station      False
return_station_name    False
dtype: bool

In [9]:
# NOTE(review): disabled duplicate check, kept for reference only.
# duplicate = df[df.duplicated(string_columns)]
# duplicate

- duplicated values do not add any information

In [31]:
# Count how often each (rental station, return station) combination occurs;
# value_counts returns them sorted from most to least frequent.
visitados = df.value_counts(subset=string_columns)

# Show the 20 most frequent combinations.
visitados.iloc[:20]


rental_station_name             return_station_name
                                                       359979
sandstraße                                               2732
münchner_freiheit                                        2641
olympiazentrum                                           2412
maillingerstraße                                         2084
                                sandstraße               2019
tum_arcisstraße                                          1987
rotkreuzplatz                                            1971
technische_universität_münchen                           1744
leonrodplatz                                             1741
universität                                              1716
goetheplatz_(nord)                                       1610
                                münchner_freiheit        1566
                                rotkreuzplatz            1563
hauptbahnhof_nord                                        1526
hackerbrücke      

# eda