# 자전거 대여량 예측

# 1. 데이터 확보

In [1]:
url = 'https://raw.githubusercontent.com/leekyuyoung20221226/python/main/data/bike.csv'
import numpy as np
import pandas as pd

In [2]:
# Load the bike-rental CSV from `url` into a DataFrame and preview the first two rows.
bike = pd.read_csv(url)
bike.head(2)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801


# 2. 데이터 전처리

### 결측치 확인 - 중간값 또는 평균으로 대체

In [3]:
bike.isnull().sum()

instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     5
casual        0
registered    0
cnt           0
dtype: int64

In [4]:
# Impute missing wind speeds with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and
# is not guaranteed to update `bike` (with pandas Copy-on-Write it never does).
bike['windspeed'] = bike['windspeed'].fillna(bike['windspeed'].median())

In [5]:
# Patch the missing `yr` and `mnth` values on the row labelled 730 by hand.
# NOTE(review): 1.0 / 12.0 presumably mean "second year, December" for that
# row's date — confirm against the raw data.
bike.loc[730, ['yr', 'mnth']] = [1.0, 12.0]

In [6]:
# Impute the remaining gaps in temp, atemp and hum with each column's median.
# Each result is assigned back to the frame; calling fillna(inplace=True) on a
# selected column works on an intermediate object and is not guaranteed to
# update `bike` (with pandas Copy-on-Write it never does).
for col in ('temp', 'atemp', 'hum'):
    bike[col] = bike[col].fillna(bike[col].median())

In [7]:
# Display every row that still has at least one missing value;
# an empty frame means nothing is left to impute.
bike.loc[bike.isna().any(axis='columns')]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt


In [8]:
# Keep only the numeric columns: remove the date column `dteday` in place.
bike.drop('dteday', axis=1, inplace=True)

In [9]:
bike.head(3)

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349


In [10]:
# 일단 모든 컬럼 다 사용하기

In [12]:
# 데이터 표준화 안해도 됨 - xgboost 사용

In [39]:
# Features and target (independent / dependent variables): every column but
# the last is a feature; the last column (`cnt`) is the target.
# NOTE(review): the features still include `casual` and `registered`, and in
# the previewed rows they add up exactly to `cnt` (331 + 654 = 985). The
# target therefore appears to leak into X — consider dropping both columns.
n_features = bike.shape[1] - 1
X = bike.iloc[:, :n_features].to_numpy()
Y = bike.iloc[:, n_features].to_numpy()

In [40]:
# 10-fold cross-validation of an XGBoost regressor with default settings.
# The last line displays (mean train score, mean test score) over the folds.
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor

xgbr = XGBRegressor(random_state=0)
scores = cross_validate(xgbr, X, Y, cv=10, return_train_score=True, n_jobs=-1)
scores['train_score'].mean(), scores['test_score'].mean()

(0.9999967355524811, 0.9678961166519209)

In [41]:
# Same 10-fold cross-validation, this time with a default LightGBM regressor.
# The last line displays (mean train score, mean test score) over the folds.
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_validate

lgbmr = LGBMRegressor(random_state=0)
scores = cross_validate(lgbmr, X, Y, cv=10, return_train_score=True, n_jobs=-1)
scores['train_score'].mean(), scores['test_score'].mean()

(0.9991020117374685, 0.9487338663481599)

In [42]:
# Split the data: hold out 30% of the rows as a test set (seed fixed at 0).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [43]:
# Fit the XGBoost regressor on the training split, then display its
# (train score, test score) pair.
xgbr.fit(x_train,y_train)
xgbr.score(x_train,y_train), xgbr.score(x_test,y_test)

(0.9999990216334502, 0.9952211899065297)

In [55]:
# Fit ordinary least squares on the full data set and display its score on
# that same data (this cell uses no hold-out split).
# NOTE(review): X contains `casual` and `registered`, whose sum equals `cnt`
# in the previewed rows, so a perfect score here is expected rather than a
# sign of a good model.
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X, Y)
lr.score(X, Y)

1.0

In [53]:
x_test[12], y_test[12]

(array([5.60000e+01, 1.00000e+00, 0.00000e+00, 2.00000e+00, 0.00000e+00,
        5.00000e+00, 1.00000e+00, 2.00000e+00, 3.64348e-01, 3.50461e-01,
        7.12174e-01, 3.46539e-01, 1.20000e+02, 1.34100e+03]),
 1461)

In [54]:
lr.predict([x_test[12]])

array([1461.])

In [56]:
bike.head(1)

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985


In [61]:
lr.predict([X[0]])

array([985.])

In [62]:
####################################

In [127]:
# Census ("adult") data set.
# The file is read with header=None, so the columns are numbered 0..14 until
# they are renamed in a later cell. Note that `url` is rebound here.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url,header=None)
df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [128]:
# Give the 15 positional columns their descriptive names.
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [129]:
df.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [130]:
# 데이터 전처리
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [131]:
# Remove a column whose meaning duplicates another feature.
# NOTE(review): presumably `education-num` encodes the same information — confirm.
df.drop('education', axis=1, inplace=True)

In [132]:
# Treat the ' ?' placeholder (note the leading space) as a missing value.
# Assigned back rather than replace(inplace=True) on the selected column,
# which is not guaranteed to update `df` (never under pandas Copy-on-Write).
df['workclass'] = df['workclass'].replace(' ?', np.nan)

In [133]:
# Treat the ' ?' placeholder (note the leading space) as a missing value.
# Assigned back rather than replace(inplace=True) on the selected column,
# which is not guaranteed to update `df` (never under pandas Copy-on-Write).
df['occupation'] = df['occupation'].replace(' ?', np.nan)

In [134]:
# Treat the ' ?' placeholder (note the leading space) as a missing value.
# Assigned back rather than replace(inplace=True) on the selected column,
# which is not guaranteed to update `df` (never under pandas Copy-on-Write).
df['native-country'] = df['native-country'].replace(' ?', np.nan)

In [135]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [136]:
len(df)

32561

In [142]:
# Discard every row that has a missing value in any column; `df` itself is left unchanged.
df2 = df.dropna()

In [143]:
# Convert the string-valued columns to numbers via one-hot encoding.
df2 = pd.get_dummies(df2)

In [147]:
# Renumber the rows from 0 after the dropna above; drop=True discards the old index.
df2.reset_index(drop=True,inplace=True)

In [148]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 90 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   age                                         30162 non-null  int64
 1   fnlwgt                                      30162 non-null  int64
 2   education-num                               30162 non-null  int64
 3   capital-gain                                30162 non-null  int64
 4   capital-loss                                30162 non-null  int64
 5   hours-per-week                              30162 non-null  int64
 6   workclass_ Federal-gov                      30162 non-null  uint8
 7   workclass_ Local-gov                        30162 non-null  uint8
 8   workclass_ Private                          30162 non-null  uint8
 9   workclass_ Self-emp-inc                     30162 non-null  uint8
 10  workclass_ Self-emp-not-inc       

In [154]:
# Remove the 'income_ <=50K' dummy column.
# NOTE(review): presumably the complementary 'income_ >50K' dummy is now the
# last column and serves as the binary target — confirm the column order.
df2.drop('income_ <=50K', axis=1, inplace=True)

In [155]:
# Searching for the best-fitting model: start by separating features and target.
# Every column but the last goes into X; the last column becomes Y.
# Note that this rebinds the X / Y used for the bike data earlier.
n_features = df2.shape[1] - 1
X = df2.iloc[:, :n_features].to_numpy()
Y = df2.iloc[:, n_features].to_numpy()

In [156]:
# 선형회귀 - 로지스틱회귀
# 트리 -  앙상블포함 xgboost
# 이웃 - KNeighborsClassifier

In [157]:
# Standardize the features.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the validation folds contribute to the scaling statistics; wrapping the
# scaler and the model in a Pipeline would avoid that.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X = ss.fit(X).transform(X)

In [163]:
# Cross-validation helper
def cross_val(classifier, num_split=10, features=None, target=None):
    """Cross-validate *classifier* and return its mean train and test scores.

    Args:
        classifier: Estimator accepted by scikit-learn's ``cross_validate``.
        num_split: Value passed through as ``cv`` (fold count when an int).
        features: Feature matrix. Defaults to the notebook-level ``X``.
        target: Label vector. Defaults to the notebook-level ``Y``.

    Returns:
        Tuple ``(mean train score, mean test score)`` over all folds.
    """
    # Imported locally so the helper does not depend on an earlier cell
    # having been executed.
    from sklearn.model_selection import cross_validate

    # Fall back to the notebook globals so existing calls keep working, while
    # letting callers pass data explicitly instead of relying on whatever
    # X / Y happen to be bound at call time.
    if features is None:
        features = X
    if target is None:
        target = Y

    scores = cross_validate(
        classifier, features, target, cv=num_split, return_train_score=True
    )
    return np.mean(scores['train_score']), np.mean(scores['test_score'])

In [164]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [165]:
# One default-configured classifier per model family to compare.
# Note that `lr` is rebound here (it held the linear regressor earlier).
lr, xgbc, knc = LogisticRegression(), XGBClassifier(), KNeighborsClassifier()

In [166]:
# Print (mean train score, mean test score) for each candidate, in the
# order logistic regression, XGBoost, k-nearest neighbours.
for clf in (lr, xgbc, knc):
    print(cross_val(clf))

(0.8496341996398129, 0.8473575908050666)
(0.9004523821081578, 0.8678469881986164)
(0.8754208813106785, 0.8243157254778184)


In [169]:
# Re-run the cross-validation with XGBoost limited to 10 estimators, to
# compare its train/test scores against the default configuration above.
cross_val(XGBClassifier(n_estimators = 10))

(0.8646973067474641, 0.860056035252051)