In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the full 2022 Divvy trips file (gzip-compressed CSV) into a DataFrame.
trips_csv_path = "data/2022/Divvy_Trips_2022_full.csv.gz"
df = pd.read_csv(trips_csv_path, compression="gzip")

# Общая информация о датасете

In [4]:
# Preview the first rows to check the column layout and sample values.
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member


In [5]:
# Print the row count, column dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5667717 entries, 0 to 5667716
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 562.1+ MB


In [14]:
# Keep only rides that have both timestamps and start strictly before they end.
#
# `started_at` / `ended_at` are loaded as strings (object dtype), so they are
# parsed to datetimes for the comparison instead of relying on lexicographic
# string order. errors="coerce" turns missing or unparseable values into NaT,
# which are then rejected explicitly. The parsed values are used only to build
# the mask; the columns stored in `df` keep their original contents.
started_ts = pd.to_datetime(df["started_at"], errors="coerce")
ended_ts = pd.to_datetime(df["ended_at"], errors="coerce")

# Missing-value check comes first, then the ordering check.
has_both_timestamps = started_ts.notna() & ended_ts.notna()
df = df[has_both_timestamps & (started_ts < ended_ts)]

## Работа с данными

In [15]:
# Pull the station-name column for each end of a trip and give both frames
# the same column label, "station_name".

# Stations where trips start
station_names = df[["start_station_name"]].rename(
    {"start_station_name": "station_name"}, axis="columns"
)

# Stations where trips end
station_end_names = df[["end_station_name"]].rename(
    {"end_station_name": "station_name"}, axis="columns"
)

In [16]:
# Stack start- and end-station names into one single-column frame.
# Both halves are read straight from `df` instead of being appended onto the
# existing `station_names`, so re-running this cell cannot add the end
# stations a second time and inflate the station counts.
station_names = pd.concat(
    [
        df["start_station_name"].rename("station_name"),
        df["end_station_name"].rename("station_name"),
    ]
).to_frame()

In [17]:
# Show the stacked station names (pandas truncates the rendered frame).
station_names

Unnamed: 0,station_name
0,Glenwood Ave & Touhy Ave
1,Glenwood Ave & Touhy Ave
2,Sheffield Ave & Fullerton Ave
3,Clark St & Bryn Mawr Ave
4,Michigan Ave & Jackson Blvd
...,...
5667712,Peoria St & Jackson Blvd
5667713,Seeley Ave & Roscoe St
5667714,Green St & Madison St
5667715,Peoria St & Jackson Blvd


## Подсчет популярности отдельных станций

In [18]:
# Count how many times each station name occurs in the stacked frame.
station_counts = station_names["station_name"].value_counts()
popular_station = station_counts.to_frame()

In [19]:
# Build the station popularity table with columns "station_name" and "count",
# most frequent station first.
# The table is derived from `station_names` in one chain rather than by
# resetting the index of the existing `popular_station`: a second run of the
# old cell added an extra index column and the two-name column assignment
# then failed with a length mismatch.
popular_station = (
    station_names["station_name"]
    .value_counts()
    .rename_axis("station_name")   # label the index so it becomes a named column
    .reset_index(name="count")     # counts go into a column called "count"
)

popular_station

Unnamed: 0,station_name,count
0,Streeter Dr & Grand Ave,150593
1,DuSable Lake Shore Dr & North Blvd,82225
2,DuSable Lake Shore Dr & Monroe St,81399
3,Michigan Ave & Oak St,79780
4,Wells St & Concord Ln,74933
...,...,...
1704,Paul Revere Elementary School,1
1705,Public Rack - May St & 87th St,1
1706,Public Rack - Lotus Ave & Harrison St,1
1707,Public Rack - 83rd St (Avalon Park) Metra,1


## Самые популярные маршруты

In [20]:
# Keep only the start/end station names; each row is one ride's route.
route_columns = ["start_station_name", "end_station_name"]
trip = df.loc[:, route_columns]

In [21]:
# Count how many times each (start station, end station) pair occurs.
route_counts = trip[["start_station_name", "end_station_name"]].value_counts()
popular_trip = route_counts.to_frame()

In [22]:
# Build the route popularity table with columns "start_name", "stop_name"
# and "count", most frequent route first.
# The table is derived from `trip` in one chain rather than by resetting the
# index of the existing `popular_trip`: a second run of the old cell added an
# extra index column and the three-name column assignment then failed with a
# length mismatch.
popular_trip = (
    trip[["start_station_name", "end_station_name"]]
    .value_counts()
    .rename_axis(["start_name", "stop_name"])  # name both index levels
    .reset_index(name="count")                 # counts go into a "count" column
)

popular_trip

Unnamed: 0,start_name,stop_name,count
0,Streeter Dr & Grand Ave,Streeter Dr & Grand Ave,12192
1,DuSable Lake Shore Dr & Monroe St,DuSable Lake Shore Dr & Monroe St,7374
2,Ellis Ave & 60th St,University Ave & 57th St,6797
3,University Ave & 57th St,Ellis Ave & 60th St,6362
4,Ellis Ave & 60th St,Ellis Ave & 55th St,6359
...,...,...,...
174367,Lincoln Ave & Diversey Pkwy,Shields Ave & 28th Pl,1
174368,Lincoln Ave & Diversey Pkwy,State St & 19th St,1
174369,Lincoln Ave & Diversey Pkwy,Stetson Ave & South Water St,1
174370,Lincoln Ave & Diversey Pkwy,Throop St & Taylor St,1
