In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/Divvy_Trips_2022_full.csv.gz", compression="gzip")

# Общая информация о датасете

In [3]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5667717 entries, 0 to 5667716
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 562.1+ MB


## Работа с данными

In [5]:
station_park = [
    "Field Museum",
    "Canal St & Adams St",
    "Clinton St & Madison St",
    "Loomis St & Lexington St",
    "Ellis Ave & 60th St ",
]

In [6]:
# Выбор нужных колонок датасета - координаты, название и id станций

# Данные начальных станций
station_names = df[["start_station_name"]]
station_names = station_names.rename(columns={"start_station_name": "station_name"})

# Данные конечных станций
station_end_names = df[["end_station_name"]]
station_end_names = station_end_names.rename(
    columns={"end_station_name": "station_name"}
)

In [77]:
# Склейка данных в 1 датасет
station_names = pd.concat([station_names, station_end_names])
station_names = station_names.drop_duplicates().dropna()
df.rename(columns={"end_station_name": "station_name"}, inplace=True)

park = station_names.join(
    df.set_index("station_name")[["started_at", "member_casual"]], on="station_name"
)

In [78]:
park.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4774992 entries, 0 to 5558381
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   station_name   object
 1   started_at     object
 2   member_casual  object
dtypes: object(3)
memory usage: 145.7+ MB


In [79]:
park = park[park.station_name.isin(station_park) == True]

In [80]:
park["started_at"] = park["started_at"].astype("datetime64[D]")

In [81]:
park

Unnamed: 0,station_name,started_at,member_casual
1057,Canal St & Adams St,2022-01-20,member
1057,Canal St & Adams St,2022-01-12,member
1057,Canal St & Adams St,2022-01-25,member
1057,Canal St & Adams St,2022-01-13,member
1057,Canal St & Adams St,2022-01-31,member
...,...,...,...
2213,Field Museum,2022-12-18,casual
2213,Field Museum,2022-12-18,casual
2213,Field Museum,2022-12-01,casual
2213,Field Museum,2022-12-06,casual


In [82]:
park = park[(park["started_at"] >= "2022-07-11") & (park["started_at"] < "2022-07-18")]

In [83]:
park.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2083 entries, 1057 to 2213
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   station_name   2083 non-null   object        
 1   started_at     2083 non-null   datetime64[ns]
 2   member_casual  2083 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 65.1+ KB


In [84]:
park = pd.DataFrame(
    park.groupby(["station_name", "started_at"]).member_casual.value_counts()
)

In [85]:
park.rename(columns={"member_casual": "member_casual_count"}, inplace=True)

In [86]:
park["all_users"] = park.groupby(
    ["station_name", "started_at"]
).member_casual_count.sum()

In [87]:
park = park.reset_index(["station_name", "started_at", "member_casual"])

In [88]:
park.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   station_name         56 non-null     object        
 1   started_at           56 non-null     datetime64[ns]
 2   member_casual        56 non-null     object        
 3   member_casual_count  56 non-null     int64         
 4   all_users            56 non-null     int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 2.3+ KB


In [89]:
park["percent"] = park["member_casual_count"] / park["all_users"] * 100

In [94]:
park = pd.DataFrame(park.groupby(["station_name", "member_casual"])["percent"].median())

In [116]:
park

Unnamed: 0_level_0,Unnamed: 1_level_0,percent
station_name,member_casual,Unnamed: 2_level_1
Canal St & Adams St,casual,39.777674
Canal St & Adams St,member,60.222326
Clinton St & Madison St,casual,31.223734
Clinton St & Madison St,member,68.776266
Field Museum,casual,67.05522
Field Museum,member,32.94478
Loomis St & Lexington St,casual,13.453365
Loomis St & Lexington St,member,86.546635


In [130]:
percent = park.groupby("member_casual")["percent"].median().reset_index("member_casual")

member_percent = float(percent[percent["member_casual"] == "member"]["percent"])
casual_percent = float(percent[percent["member_casual"] == "casual"]["percent"])

In [131]:
print(member_percent, casual_percent)

64.49929584025246 35.50070415974754
