In [1]:
import pandas as pd
import numpy as np

### 1. 데이터 불러오기

In [2]:
# Read the raw ride log; the file is stored in cp949 encoding.
final_df = pd.read_csv("./final1.csv", encoding='cp949')

# Remove the 'Unnamed: 0' column that comes in with the CSV.
del final_df['Unnamed: 0']

# Parse the registration timestamp column into datetime values.
final_df['등록일시'] = pd.to_datetime(final_df['등록일시'])
final_df.head()

Unnamed: 0,자전거번호,등록일시,이용시간,이용거리,고장구분
0,SPB-00003,2020-04-01,3,360,0
1,SPB-00003,2020-04-01,22,1961,0
2,SPB-00003,2020-04-01,38,3273,0
3,SPB-00003,2020-04-01,19,1696,0
4,SPB-00003,2020-04-01,6,609,0


In [3]:
# Print the row count, column dtypes and memory usage of the loaded frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54044163 entries, 0 to 54044162
Data columns (total 5 columns):
 #   Column  Dtype         
---  ------  -----         
 0   자전거번호   object        
 1   등록일시    datetime64[ns]
 2   이용시간    int64         
 3   이용거리    int64         
 4   고장구분    int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 2.0+ GB


### 2. 누적이용시간, 누적이용거리, 누적이용횟수, 누적고장횟수

In [4]:
# Per-bike running totals: usage time, distance, ride count and breakdown count.
#
# Each statistic restarts whenever the bike number (자전거번호) changes and
# accumulates in the frame's current row order, so the rows of every bike are
# expected to already be in chronological order.
#
# The built-in GroupBy cumulative methods replace `apply(lambda x: x.cumsum())`:
# their result is always aligned to the original row index, whereas a
# transform-style `apply` on recent pandas versions prepends the group key to
# the index and the column assignment then fails. They also avoid one
# Python-level call per bike. No NaN placeholder columns are created
# beforehand because every assignment below defines its column outright.
rides_by_bike = final_df.groupby(by=['자전거번호'])

final_df['누적이용시간'] = rides_by_bike['이용시간'].cumsum()  # cumulative usage time
final_df['누적이용거리'] = rides_by_bike['이용거리'].cumsum()  # cumulative distance
final_df['누적이용횟수'] = rides_by_bike.cumcount() + 1        # ride counter, starts at 1
final_df['누적고장횟수'] = rides_by_bike['고장구분'].cumsum()  # breakdowns recorded so far

In [5]:
# Preview the first rows to check the newly added cumulative columns.
final_df.head()

Unnamed: 0,자전거번호,등록일시,이용시간,이용거리,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수
0,SPB-00003,2020-04-01,3,360,0,3,360,1,0
1,SPB-00003,2020-04-01,22,1961,0,25,2321,2,0
2,SPB-00003,2020-04-01,38,3273,0,63,5594,3,0
3,SPB-00003,2020-04-01,19,1696,0,82,7290,4,0
4,SPB-00003,2020-04-01,6,609,0,88,7899,5,0


### 3. 이용강도, 평균이용거리

In [6]:
# Usage intensity  = cumulative distance divided by cumulative usage time
# Mean ride length = cumulative distance divided by cumulative ride count
distance_total = final_df['누적이용거리']
final_df['이용강도'] = distance_total.div(final_df['누적이용시간'])
final_df['평균이용거리'] = distance_total.div(final_df['누적이용횟수'])

# Cast both ratios to integers (raises if a value cannot be converted)
final_df = final_df.astype(dict.fromkeys(['이용강도', '평균이용거리'], int), errors='raise')

In [7]:
# Drop the columns that are no longer needed
final_df = final_df.drop(columns=['이용시간', '이용거리'])
final_df.head()

Unnamed: 0,자전거번호,등록일시,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리
0,SPB-00003,2020-04-01,0,3,360,1,0,120,360
1,SPB-00003,2020-04-01,0,25,2321,2,0,92,1160
2,SPB-00003,2020-04-01,0,63,5594,3,0,88,1864
3,SPB-00003,2020-04-01,0,82,7290,4,0,88,1822
4,SPB-00003,2020-04-01,0,88,7899,5,0,89,1579


### 4. 자전거 나이

In [8]:
# Bike age: time elapsed between a bike's first rental and each of its rides.

# One row per bike, holding the date of the first row recorded for that bike.
# Columns are selected by name rather than by position, so the result stays
# correct even if the column layout of final_df changes.
first_rows = final_df.drop_duplicates(subset=['자전거번호'], keep='first')
first_df = first_rows[['자전거번호', '등록일시']].rename(columns={'등록일시': '첫 대여일'})

# Attach the first-rental date to every ride by looking it up per bike number.
# Unlike a merge, this does not build a second copy of the whole frame, and
# re-running the cell overwrites '첫 대여일' instead of leaving a suffixed
# duplicate column behind.
final_df['첫 대여일'] = final_df['자전거번호'].map(first_df.set_index('자전거번호')['첫 대여일'])

# Age of the bike at the time of each ride (timedelta)
final_df['나이'] = final_df['등록일시'] - final_df['첫 대여일']

final_df.tail()
# Author's note: no missing values

Unnamed: 0,자전거번호,등록일시,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리,첫 대여일,나이
54044158,SPB-90005,2020-08-19,0,36,3340,3,0,92,1113,2020-06-25,55 days
54044159,SPB-90005,2020-08-19,0,57,5197,4,0,91,1299,2020-06-25,55 days
54044160,SPB-90005,2020-08-19,0,62,5723,5,0,92,1144,2020-06-25,55 days
54044161,SPB-90005,2020-08-20,0,64,5970,6,0,93,995,2020-06-25,56 days
54044162,SPB-90005,2020-08-20,0,109,9848,7,0,90,1406,2020-06-25,56 days


### 5. 샘플링

In [9]:
# Balanced sampling: for each year draw the same number of breakdown rows
# (고장구분 == 1) and non-breakdown rows (고장구분 == 0).
SAMPLE_SIZE = 10000  # rows drawn per (year, breakdown flag) group
SAMPLE_SEED = 42     # fixed seed so that a fresh run reproduces the same sample


def _sample_by_flag(year_df, flag, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Return ``n`` randomly chosen rows of ``year_df`` whose 고장구분 equals ``flag``.

    Args:
        year_df: Frame containing a 고장구분 column.
        flag: Value of 고장구분 to keep (1 = breakdown, 0 = no breakdown).
        n: Number of rows to draw, without replacement.
        seed: Seed passed to ``DataFrame.sample`` for reproducibility.

    Raises:
        ValueError: If fewer than ``n`` rows match ``flag``.
    """
    group = year_df[year_df['고장구분'] == flag]
    if len(group) < n:
        raise ValueError(
            f"cannot sample {n} rows: only {len(group)} rows have 고장구분 == {flag}"
        )
    return group.sample(n=n, random_state=seed)


# Split by the calendar year of the registration timestamp
df_2020 = final_df[final_df['등록일시'].dt.year == 2020]
df_2021 = final_df[final_df['등록일시'].dt.year == 2021]

break_2020 = _sample_by_flag(df_2020, 1)  # rides in 2020 flagged as breakdown
alive_2020 = _sample_by_flag(df_2020, 0)  # rides in 2020 without a breakdown

break_2021 = _sample_by_flag(df_2021, 1)  # rides in 2021 flagged as breakdown
alive_2021 = _sample_by_flag(df_2021, 0)  # rides in 2021 without a breakdown

In [10]:
# Stack the four samples into one frame with a fresh 0..N-1 index.
data = pd.concat(
    objs=[break_2020, break_2021, alive_2020, alive_2021],
    ignore_index=True,
)

In [11]:
# Preview the first rows of the combined sample.
data.head()

Unnamed: 0,자전거번호,등록일시,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리,첫 대여일,나이
0,SPB-43181,2020-10-28,1,14910,1398094,458,1,93,3052,2020-09-04,54 days
1,SPB-16604,2020-03-04,1,1281,189910,78,1,148,2434,2020-01-02,62 days
2,SPB-42701,2020-09-09,1,8324,769168,332,3,92,2316,2020-07-14,57 days
3,SPB-31194,2020-08-01,1,31610,2988819,949,1,94,3149,2020-04-18,105 days
4,SPB-43378,2020-09-03,1,5587,518114,176,1,92,2943,2020-07-24,41 days


In [12]:
# Print the row count, non-null counts and dtypes of the combined sample.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype          
---  ------  --------------  -----          
 0   자전거번호   40000 non-null  object         
 1   등록일시    40000 non-null  datetime64[ns] 
 2   고장구분    40000 non-null  int64          
 3   누적이용시간  40000 non-null  int64          
 4   누적이용거리  40000 non-null  int64          
 5   누적이용횟수  40000 non-null  int64          
 6   누적고장횟수  40000 non-null  int64          
 7   이용강도    40000 non-null  int32          
 8   평균이용거리  40000 non-null  int32          
 9   첫 대여일   40000 non-null  datetime64[ns] 
 10  나이      40000 non-null  timedelta64[ns]
dtypes: datetime64[ns](2), int32(2), int64(5), object(1), timedelta64[ns](1)
memory usage: 3.1+ MB


In [13]:
# Summary statistics of the combined sample.
data.describe()

Unnamed: 0,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리,나이
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000
mean,0.5,26889.63215,2898655.0,924.8135,3.0486,112.168825,3255.142525,198 days 12:46:48
std,0.500006,22517.767785,2358642.0,784.955759,3.200249,22.718946,750.066506,156 days 16:52:23.564273898
min,0.0,1.0,360.0,1.0,0.0,18.0,360.0,0 days 00:00:00
25%,0.0,8527.0,997336.8,291.0,1.0,96.0,2915.0,72 days 00:00:00
50%,0.5,20619.5,2280750.0,694.0,2.0,105.0,3100.0,153 days 00:00:00
75%,1.0,40154.5,4274894.0,1379.0,4.0,121.0,3377.0,313 days 00:00:00
max,1.0,118954.0,12746410.0,4260.0,37.0,886.0,29070.0,664 days 00:00:00


### 6. 여름 횟수

In [14]:
# Keep only the two date columns, in this order: ride date, first-rental date.
summer_df = data[['등록일시', '첫 대여일']]
summer_df

Unnamed: 0,등록일시,첫 대여일
0,2020-10-28,2020-09-04
1,2020-03-04,2020-01-02
2,2020-09-09,2020-07-14
3,2020-08-01,2020-04-18
4,2020-09-03,2020-07-24
...,...,...
39995,2021-04-06,2020-06-11
39996,2021-07-20,2020-06-18
39997,2021-08-10,2020-04-22
39998,2021-04-02,2020-05-25


In [15]:
# Count how many summers (Julys) lie between the first rental and a ride
def get_summer(i):
    """Count the July months between a bike's first rental and one ride.

    Reads row ``i`` (by position) of the module-level ``summer_df``; column 1
    is used as the start date and column 0 as the end date.

    Args:
        i: Zero-based row position in ``summer_df``.

    Returns:
        int: Number of months equal to July in the inclusive monthly range
        from the start date's month to the end date's month.
    """
    range_start = summer_df.iloc[i, 1]
    range_end = summer_df.iloc[i, 0]

    # Every calendar month touched by the interval, both ends included
    months = pd.period_range(start=range_start, end=range_end, freq='M')

    # Count the Julys directly on the PeriodIndex; no intermediate DataFrame
    # or Python-level filtering is needed.
    return int((months.month == 7).sum())

In [16]:
# Series holding the row labels of `data`; each label is handed to get_summer
index_list = data.index.to_series()

# Store the summer count of every row in a new '여름' column
data['여름'] = index_list.map(get_summer)

data

Unnamed: 0,자전거번호,등록일시,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리,첫 대여일,나이,여름
0,SPB-43181,2020-10-28,1,14910,1398094,458,1,93,3052,2020-09-04,54 days,0
1,SPB-16604,2020-03-04,1,1281,189910,78,1,148,2434,2020-01-02,62 days,0
2,SPB-42701,2020-09-09,1,8324,769168,332,3,92,2316,2020-07-14,57 days,1
3,SPB-31194,2020-08-01,1,31610,2988819,949,1,94,3149,2020-04-18,105 days,1
4,SPB-43378,2020-09-03,1,5587,518114,176,1,92,2943,2020-07-24,41 days,1
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,SPB-39164,2021-04-06,0,13880,1496592,451,3,107,3318,2020-06-11,299 days,1
39996,SPB-40299,2021-07-20,0,17637,1913195,649,4,108,2947,2020-06-18,397 days,2
39997,SPB-33321,2021-08-10,0,55473,5723969,1937,6,103,2955,2020-04-22,475 days,2
39998,SPB-37079,2021-04-02,0,31721,3290155,1079,0,103,3049,2020-05-25,312 days,1


In [17]:
# Drop the identifier and date columns that are no longer needed
data = data.drop(columns=['자전거번호', '등록일시', '첫 대여일'])

# Convert the age from a timedelta to a whole number of days.
# `.dt.days` reads the day component directly, so the result does not depend
# on how a timedelta is rendered as text (the previous str -> split -> int
# round trip did).
data['나이'] = data['나이'].dt.days

# Cast to the default integer dtype, as the original conversion did
data = data.astype({'나이': int}, errors='raise')

### 7. 데이터셋 완성

In [18]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,고장구분,누적이용시간,누적이용거리,누적이용횟수,누적고장횟수,이용강도,평균이용거리,나이,여름
0,1,14910,1398094,458,1,93,3052,54,0
1,1,1281,189910,78,1,148,2434,62,0
2,1,8324,769168,332,3,92,2316,57,1
3,1,31610,2988819,949,1,94,3149,105,1
4,1,5587,518114,176,1,92,2943,41,1


In [19]:
# Rename the columns to English
english_names = {
    '고장구분': 'breakdown',
    '누적이용시간': 'cumTime',
    '누적이용거리': 'cumDist',
    '누적이용횟수': 'cumRide',
    '누적고장횟수': 'cumBreak',
    '이용강도': 'intensity',
    '평균이용거리': 'meanDist',
    '나이': 'age',
    '여름': 'summer',
}
data = data.rename(columns=english_names)

In [20]:
# Preview the first rows to check the renamed columns.
data.head()

Unnamed: 0,breakdown,cumTime,cumDist,cumRide,cumBreak,intensity,meanDist,age,summer
0,1,14910,1398094,458,1,93,3052,54,0
1,1,1281,189910,78,1,148,2434,62,0
2,1,8324,769168,332,3,92,2316,57,1
3,1,31610,2988819,949,1,94,3149,105,1
4,1,5587,518114,176,1,92,2943,41,1


In [21]:
# Print the row count, non-null counts and dtypes of the final dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   breakdown  40000 non-null  int64
 1   cumTime    40000 non-null  int64
 2   cumDist    40000 non-null  int64
 3   cumRide    40000 non-null  int64
 4   cumBreak   40000 non-null  int64
 5   intensity  40000 non-null  int32
 6   meanDist   40000 non-null  int32
 7   age        40000 non-null  int32
 8   summer     40000 non-null  int64
dtypes: int32(3), int64(6)
memory usage: 2.3 MB


In [22]:
# Save the dataset to bike_data.csv (relative path); mode='w' opens the file for writing.
# NOTE(review): `index=` is left at its default, so the row index is presumably
# written as an extra unnamed first column — confirm that readers of this file expect it.
data.to_csv("bike_data.csv", mode='w')