## Numpy random

[numpy random 함수]

- np.random.seed : seed를 통한 난수 생성
- np.random.randint : 정수 난수 1개 생성
- np.random.rand : 0부터 1사이의 균일분포에서 난수 매트릭스 배열 생성
- np.random.randn : 가우시안 표준 정규분포에서 난수 매트릭스 배열 생성
- np.random.shuffle : 기존의 데이터의 순서 바꾸기
- np.random.choice : 기존 데이터에서 sampling

## Python random

In [8]:
import random

# 0.0 <= x < 1.0 사이 난수 리턴
print(random.random(),'\n') 

# 1 <= x <= 10 사이의 정수중에서 난수 값 리턴
print(random.randint(1,10),'\n')  

# min, max 사이 float 리턴
print(random.uniform(10,20),'\n') 

# 지정 범위 int 리턴
print(random.randrange(10),'\n')

# 리스트 내부에 있는 요소를 랜덤하게 선택
print(random.choice([1,2,3,4,5]),'\n')

# 리스트 요소를 중복이 안되게 리턴
li = [1,2,3,4,5]
print(random.sample(li, 3), '\n')

# 리스트 요소를 다시 섞어서 리턴  /요고는 print 안되고 list를 print해주기
random.shuffle(li)
print(li)

0.5620395947406239 

10 

18.521177560402517 

3 

2 

[2, 4, 5] 

[3, 5, 4, 1, 2]


#### 균일분포

In [9]:
import numpy as np

# 0 ~ 1 사이의 균일분포
r = np.random.rand(6)
print(r)
r.mean()

[0.15008269 0.57175707 0.65920109 0.41453769 0.02717328 0.05395647]


0.31278471436789385

In [11]:
# 오우 100만개 정도 하니까 균일분포가 0.5 나옴
r = np.random.rand(100000000)
r.mean()

0.5000325477033246

#### 표준 정규분포

In [12]:
r = np.random.randn(10000)
print(r,'\n')
r.mean()

[-0.55536655  0.11661107 -0.89993347 ... -0.08073371 -0.03780297
 -0.61741271] 



-0.009580168767124928

In [13]:
# 정수일때는 randint 실수일때는 uniform

np.random.uniform(1.0,2.0,10)  # 1.0 ~ 2.0 사이 size = 10개

array([1.93402329, 1.05627126, 1.61529779, 1.28372849, 1.09881742,
       1.56017738, 1.74968748, 1.28506328, 1.06536399, 1.5438524 ])

In [14]:
np.random.uniform(0,1,(2,3))   # array로도 뽑을 수 이따

array([[0.40178374, 0.16349877, 0.41180253],
       [0.02443944, 0.48143995, 0.81859135]])

In [15]:
# 0 ~ 1 사이의 난수 실수값으로 행렬 생성
# uniform 함수의 최소값, 최대값을 0,1 로 지정한 것과 같음
np.random.random_sample((2,3))

array([[0.4001426 , 0.75776487, 0.87898522],
       [0.7825264 , 0.35783173, 0.91460699]])

In [16]:
import pandas as pd
ar = np.random.rand(4,8)
ar

array([[0.84392188, 0.21492021, 0.23776254, 0.1143509 , 0.46506577,
        0.98318956, 0.47379706, 0.49668981],
       [0.9916345 , 0.40389289, 0.04923135, 0.15960645, 0.46745635,
        0.36712318, 0.07560576, 0.94848806],
       [0.39957862, 0.09468514, 0.29854479, 0.46106686, 0.4529827 ,
        0.55911661, 0.86329945, 0.32638902],
       [0.6497701 , 0.45260875, 0.23090418, 0.53205539, 0.06568029,
        0.41516386, 0.86115658, 0.84009617]])

In [None]:
# 배열을 df로 변환
# columns 8개에 대한 이름
# columns = ['id','gender','age','region','product','price','qty','time']

In [None]:
# Q. id, gender, age 컬럼에 대하여 의미있는 값으로 변환하여 3개의 컬럼으로 구성된 df3을 출력하세요.

In [46]:
df = pd.DataFrame(ar,columns = ['id','gender','age','region','product','price','qty','time']
)
df

Unnamed: 0,id,gender,age,region,product,price,qty,time
0,0.843922,0.21492,0.237763,0.114351,0.465066,0.98319,0.473797,0.49669
1,0.991634,0.403893,0.049231,0.159606,0.467456,0.367123,0.075606,0.948488
2,0.399579,0.094685,0.298545,0.461067,0.452983,0.559117,0.863299,0.326389
3,0.64977,0.452609,0.230904,0.532055,0.06568,0.415164,0.861157,0.840096


In [50]:
df.id = np.random.randint(10,99,4)
df.gender = np.random.choice(['male','female'],4)
df.age = np.random.randint(0,100,4)
df[['id','gender','age']]

Unnamed: 0,id,gender,age
0,12,male,40
1,95,female,61
2,31,female,99
3,39,female,8


In [54]:
# 배열에서 정렬한 후 50%에 해당하는 원소만 출력
large_ar = np.random.randn(1000)
large_ar.sort()

In [55]:
large_ar[int(0.5*len(large_ar))]

0.011537137066204342

##### 배열 집합 연산

In [56]:
names = np.array(['Bob','Joe','Will','Joe','Joe'])
np.unique(names)   # unique 중복 안되게

array(['Bob', 'Joe', 'Will'], dtype='<U4')

In [57]:
names = np.array(['Bob','Joe','Will','Joe','Joe'])
sorted(set(names))

['Bob', 'Joe', 'Will']

In [58]:
# 첫번째 배열의 원소가 두번째 배열의 원소를 포함하는지 알려주는 불리언 배열을 반환

values = np.array([6,0,0,3,2,5,6])
np.in1d(values,[2,3,6])

array([ True, False, False,  True,  True, False,  True])

##### 배열 데이터의 입출력

In [59]:
ar = np.arange(10)
np.save('some_ar',ar)

In [60]:
np.load('some_ar.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [61]:
# np.savez : 여러개의 배열을 압축된 형식으로 저장

np.savez('ar_archive.npz', a=ar, b=ar)

In [63]:
arch = np.load('ar_archive.npz')
arch['b']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
a = np.arange(1,10)
print(a)
np.argsort(a)  # < 인덱스 뽑기

[1 2 3 4 5 6 7 8 9]


array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [68]:
# 인덱스 배열을 역순으로 출력

np.argsort(a)[::-1]  # [시작 인덱스 : 끝 인덱스 : step]

array([8, 7, 6, 5, 4, 3, 2, 1, 0], dtype=int64)

In [69]:
np.argsort(a)[::-2]  # step 이니까 -2개씩 띄워서

array([8, 6, 4, 2, 0], dtype=int64)

### 도전과제 (7/6)

- np.arange(336).reshape(6,7,8)에서 100번째 요소의 인덱스를 구하세요.
- np.arange(4)을 이용해서 아래와 같이 출력하세요.

In [None]:
array([[0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 2, 0, 0, 0],
       [0, 0, 3, 0, 0],
       [0, 0, 0, 4, 0]])

In [71]:
# Q. 0 ~ 10 정수 (5,5) 행렬의 역행렬을 구한 후 두 행렬간의 행렬곱을 구하세요.

In [88]:
np.random.seed(0)
a = np.random.randint(11,size=(5,5))
print(a)
b = np.linalg.inv(a)
print(b)
c = np.dot(a,b)     # 행렬 간 곱 np.dot 요고는 항등행렬!! (행렬*역행렬)
print(c)

[[ 5  0  3  3  7]
 [ 9  3  5  2  4]
 [ 7  6  8  8 10]
 [ 1  6  7  7  8]
 [ 1  5  9  8  9]]
[[-0.05202312  0.00578035  0.20231214 -0.19075145 -0.01734104]
 [-0.03179191  0.11464355 -0.15414258  0.55009634 -0.34393064]
 [-0.02023121  0.22447013 -0.31021195 -0.07418112  0.3265896 ]
 [-0.3150289  -0.40944123  0.83622351 -0.82177264  0.2283237 ]
 [ 0.32369942  0.07514451 -0.3699422   0.52023121 -0.22543353]]
[[ 1.00000000e+00 -1.38777878e-17  1.11022302e-16 -1.11022302e-16
  -3.05311332e-16]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-1.11022302e-16  3.60822483e-16  1.00000000e+00  2.22044605e-16
  -5.55111512e-17]
 [ 0.00000000e+00  1.11022302e-16  0.00000000e+00  1.00000000e+00
   2.22044605e-16]
 [ 1.66533454e-16  1.38777878e-17  3.33066907e-16  1.11022302e-16
   1.00000000e+00]]




### 과제 (7/6) .2 : 아래 가이드에 따라서 고객별 연간 구매데이터를 생성하고 아래사항을 수행하세요(관측치 1000개)

#### Q1. 고객 속성(id, 성별, 연령, 거주지역) 데이터와 거래 데이터를 통합한 데이터 프레임 df를 생성하세요.

거래 번호
- id : 1 ~ 1000번 일련번호 - 개시 후 1만 건의 거래가 있었고 걍 거기에 넘버링한 거라고 이해해보자

고객별 속성 데이터
- cusno : 고객번호 1000개 랜덤
- gender : 0,1 정수 난수 생성
- age : 0 ~ 80사이 정수 난수 생성
- region : 1 ~ 10 사이 정수 난수 생성

거래 데이터
- product_20 : 고가제품(10), 중가제품(20), 저가제품(30)로 구분제품코드 10,20,30
- product_21 : 고가제품(10), 중가제품(20), 저가제품(30)로 구분제품코드 10,20,30
- price_20 : 1000 ~ 50000 사이 실수 난수 생성
- price_21 : 1000 ~ 50000 사이 실수 난수 생성
- quantity_20 : 1 ~ 100 사이 정수 난수 생성
- quantity_21 : 1 ~ 100 사이 정수 난수 생성
- amount_20 : price_20 * quantity_20
- amount_21 : price_21 * quantity_21
- sales : 20년 대비 21년구매금액이 증가면 1, 감소면 0

#### Q2. df를 수정하여 3가지 이상의 인사이트를 포함한 데이터 셋으로 변환하세요

#### Q3. df 를 탐색적 분석을 통하여 인사이트를 도출하세요.


In [73]:
import numpy as np
import pandas as pd
import random
from datetime import datetime
import time

In [74]:
a = np.random.rand(1000, 5)
df1 = pd.DataFrame(a, columns=['id','cusno','gender','age','region'])
df1.head()

Unnamed: 0,id,cusno,gender,age,region
0,0.888171,0.853925,0.103969,0.349944,0.496696
1,0.878895,0.981854,0.336949,0.781949,0.858717
2,0.281249,0.198927,0.646614,0.564139,0.154576
3,0.742173,0.360093,0.604674,0.006493,0.590515
4,0.733426,0.345674,0.75933,0.42697,0.4232


In [75]:
df1.id = np.arange(1,1001)
df1.cusno = np.arange(5000,6000)
df1.gender = np.random.choice([0,1],1000)
df1.age = np.random.randint(0,80,1000)
df1.region = np.random.randint(1,11,1000)

df1

Unnamed: 0,id,cusno,gender,age,region
0,1,5000,0,18,10
1,2,5001,0,70,7
2,3,5002,1,32,9
3,4,5003,0,60,1
4,5,5004,0,0,3
...,...,...,...,...,...
995,996,5995,1,42,9
996,997,5996,1,9,9
997,998,5997,1,5,6
998,999,5998,1,76,4


In [76]:
b = np.random.rand(1000,9)
df2 = pd.DataFrame(b, columns=['product_20','product_21','price_20','price_21',
                              'quantity_20','quantity_21','amount_20','amount_21','sales'])
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,0.939065,0.619266,0.650981,0.256275,0.590762,0.098127,0.308161,0.426586,0.839062
1,0.813440,0.082432,0.493159,0.918688,0.108576,0.947277,0.397584,0.255584,0.509276
2,0.849214,0.197932,0.980296,0.756175,0.726078,0.612664,0.845544,0.504839,0.839857
3,0.529656,0.771732,0.601908,0.999252,0.509133,0.848014,0.922649,0.892783,0.345275
4,0.114321,0.501498,0.808487,0.798228,0.341778,0.638812,0.597658,0.071404,0.691058
...,...,...,...,...,...,...,...,...,...
995,0.890174,0.211140,0.909633,0.066321,0.829732,0.064619,0.702787,0.483917,0.196372
996,0.196225,0.734503,0.224368,0.665529,0.528688,0.488669,0.757247,0.288912,0.005643
997,0.469614,0.553114,0.807852,0.803253,0.005662,0.181468,0.838580,0.523375,0.503152
998,0.223112,0.721037,0.133675,0.625350,0.970696,0.586704,0.431983,0.792832,0.062805


In [77]:
np.random.seed(1)
df2.product_20 = np.random.choice([10,20,30],1000)
df2.product_21 = np.random.choice([10,20,30],1000)
df2.price_20 = np.random.uniform(1000,50000,1000)
df2.price_21 = np.random.uniform(1000,50000,1000)
df2.quantity_20 = np.random.randint(1,100,1000)
df2.quantity_21 = np.random.randint(1,100,1000)

df2.amount_20 = df2.price_20 * df2.quantity_20
df2.amount_21 = df2.price_21 * df2.quantity_21
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.110345,16587.328865,7,68,1.976738e+05,1.127938e+06,0.839062
1,10,20,27301.647775,38558.115088,47,67,1.283177e+06,2.583394e+06,0.509276
2,10,30,20139.914438,15087.407000,13,43,2.618189e+05,6.487585e+05,0.839857
3,20,30,35117.234072,47904.041776,62,55,2.177269e+06,2.634722e+06,0.345275
4,20,20,34245.640246,35413.471985,50,53,1.712282e+06,1.876914e+06,0.691058
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.933897,49810.930431,90,38,2.729064e+06,1.892815e+06,0.196372
996,30,10,19821.597172,6479.387748,52,72,1.030723e+06,4.665159e+05,0.005643
997,20,30,8190.648606,27021.320375,96,87,7.863023e+05,2.350855e+06,0.503152
998,10,20,35147.037995,40862.802395,63,74,2.214263e+06,3.023847e+06,0.062805


In [78]:
df2.price_20 = round(df2['price_20'], 2)
df2.price_21 = round(df2['price_21'], 2)
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.11,16587.33,7,68,1.976738e+05,1.127938e+06,0.839062
1,10,20,27301.65,38558.12,47,67,1.283177e+06,2.583394e+06,0.509276
2,10,30,20139.91,15087.41,13,43,2.618189e+05,6.487585e+05,0.839857
3,20,30,35117.23,47904.04,62,55,2.177269e+06,2.634722e+06,0.345275
4,20,20,34245.64,35413.47,50,53,1.712282e+06,1.876914e+06,0.691058
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.93,49810.93,90,38,2.729064e+06,1.892815e+06,0.196372
996,30,10,19821.60,6479.39,52,72,1.030723e+06,4.665159e+05,0.005643
997,20,30,8190.65,27021.32,96,87,7.863023e+05,2.350855e+06,0.503152
998,10,20,35147.04,40862.80,63,74,2.214263e+06,3.023847e+06,0.062805


In [103]:
df2.sales = df2.amount_21 - df2.amount_20
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.11,16587.33,7,68,1.976738e+05,1.127938e+06,9.302646e+05
1,10,20,27301.65,38558.12,47,67,1.283177e+06,2.583394e+06,1.300216e+06
2,10,30,20139.91,15087.41,13,43,2.618189e+05,6.487585e+05,3.869396e+05
3,20,30,35117.23,47904.04,62,55,2.177269e+06,2.634722e+06,4.574538e+05
4,20,20,34245.64,35413.47,50,53,1.712282e+06,1.876914e+06,1.646320e+05
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.93,49810.93,90,38,2.729064e+06,1.892815e+06,-8.362487e+05
996,30,10,19821.60,6479.39,52,72,1.030723e+06,4.665159e+05,-5.642071e+05
997,20,30,8190.65,27021.32,96,87,7.863023e+05,2.350855e+06,1.564553e+06
998,10,20,35147.04,40862.80,63,74,2.214263e+06,3.023847e+06,8.095840e+05


In [104]:
df2['sales'] = df2.sales.apply(lambda x:1 if x > 0 else 0) #***
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.11,16587.33,7,68,1.976738e+05,1.127938e+06,1
1,10,20,27301.65,38558.12,47,67,1.283177e+06,2.583394e+06,1
2,10,30,20139.91,15087.41,13,43,2.618189e+05,6.487585e+05,1
3,20,30,35117.23,47904.04,62,55,2.177269e+06,2.634722e+06,1
4,20,20,34245.64,35413.47,50,53,1.712282e+06,1.876914e+06,1
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.93,49810.93,90,38,2.729064e+06,1.892815e+06,0
996,30,10,19821.60,6479.39,52,72,1.030723e+06,4.665159e+05,0
997,20,30,8190.65,27021.32,96,87,7.863023e+05,2.350855e+06,1
998,10,20,35147.04,40862.80,63,74,2.214263e+06,3.023847e+06,1


In [98]:
compare = df2['amount_20'] > df2['amount_21']
df2['sales'] = compare
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.110345,16587.328865,7,68,1.976738e+05,1.127938e+06,False
1,10,20,27301.647775,38558.115088,47,67,1.283177e+06,2.583394e+06,False
2,10,30,20139.914438,15087.407000,13,43,2.618189e+05,6.487585e+05,False
3,20,30,35117.234072,47904.041776,62,55,2.177269e+06,2.634722e+06,False
4,20,20,34245.640246,35413.471985,50,53,1.712282e+06,1.876914e+06,False
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.933897,49810.930431,90,38,2.729064e+06,1.892815e+06,True
996,30,10,19821.597172,6479.387748,52,72,1.030723e+06,4.665159e+05,True
997,20,30,8190.648606,27021.320375,96,87,7.863023e+05,2.350855e+06,False
998,10,20,35147.037995,40862.802395,63,74,2.214263e+06,3.023847e+06,False


In [17]:
def tf(x):
    if x == True :
        return 1
    else :
        return 0

In [37]:
df2['sales'] = df2['sales'].apply(lambda x : tf(x))
df2

Unnamed: 0,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,20,20,28239.11,16587.33,7,68,1.976738e+05,1.127938e+06,0
1,10,20,27301.65,38558.12,47,67,1.283177e+06,2.583394e+06,0
2,10,30,20139.91,15087.41,13,43,2.618189e+05,6.487585e+05,0
3,20,30,35117.23,47904.04,62,55,2.177269e+06,2.634722e+06,0
4,20,20,34245.64,35413.47,50,53,1.712282e+06,1.876914e+06,0
...,...,...,...,...,...,...,...,...,...
995,20,10,30322.93,49810.93,90,38,2.729064e+06,1.892815e+06,1
996,30,10,19821.60,6479.39,52,72,1.030723e+06,4.665159e+05,1
997,20,30,8190.65,27021.32,96,87,7.863023e+05,2.350855e+06,0
998,10,20,35147.04,40862.80,63,74,2.214263e+06,3.023847e+06,0


In [105]:
df = df1.join(df2)
df

Unnamed: 0,id,cusno,gender,age,region,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
0,1,5000,1,78,10,20,20,28239.11,16587.33,7,68,1.976738e+05,1.127938e+06,1
1,2,5001,1,45,6,10,20,27301.65,38558.12,47,67,1.283177e+06,2.583394e+06,1
2,3,5002,0,57,5,10,30,20139.91,15087.41,13,43,2.618189e+05,6.487585e+05,1
3,4,5003,0,70,10,20,30,35117.23,47904.04,62,55,2.177269e+06,2.634722e+06,1
4,5,5004,0,11,1,20,20,34245.64,35413.47,50,53,1.712282e+06,1.876914e+06,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,5995,1,72,8,20,10,30322.93,49810.93,90,38,2.729064e+06,1.892815e+06,0
996,997,5996,0,11,3,30,10,19821.60,6479.39,52,72,1.030723e+06,4.665159e+05,0
997,998,5997,0,46,8,20,30,8190.65,27021.32,96,87,7.863023e+05,2.350855e+06,1
998,999,5998,0,62,3,10,20,35147.04,40862.80,63,74,2.214263e+06,3.023847e+06,1


In [40]:
# df를 수정하여 5가지 이상의 인사이트를 포함한 데이터 셋으로 변환하세요

# 항상 고가 제품만을 구매하는 고객 층(나이, 거주지 영향)
# 상품 가격이 10% 올랐을 때 고가 제품을 구매하는 고객(나이, 거주지 영향)

In [29]:
# 항상 고가 제품만을 구매하는 고객 층(나이, 거주지 영향)

import warnings
warnings.filterwarnings('ignore')


df3 = df[(df['product_20'] == 10)&(df['product_21'] == 10)]
df3

# 거주지 1 ~ 10 중 1-5 이내
# 나이 40-70대 사이
# 제품 가격 올리기

# 그런 수치요약은 어디서 구함???

# 수량 올리기
# 총 구매금액 출력


df3.region = np.random.choice([1,2,3,4,5],116)
df3.age = np.random.randint(40,70,116)
df3

Unnamed: 0,cusno,gender,age,region,product_20,product_21,price_20,price_21,quantity_20,quantity_21,amount_20,amount_21,sales
14,982,0,54,3,10,10,45982,41179,75,7,3448650,288253,1
21,114,0,41,3,10,10,27725,48190,1,60,27725,2891400,0
29,132,1,63,4,10,10,36483,5091,38,50,1386354,254550,1
41,667,1,58,1,10,10,10781,5922,85,75,916385,444150,1
42,860,1,51,5,10,10,41373,39047,64,38,2647872,1483786,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,674,0,60,3,10,10,32776,36002,90,48,2949840,1728096,1
916,920,1,63,2,10,10,13318,14923,70,76,932260,1134148,0
919,438,1,54,3,10,10,45409,40824,42,64,1907178,2612736,0
951,483,0,67,1,10,10,5723,14363,15,58,85845,833054,0


#### Q4. 거래일자, 요일, 시간을 추가해서 10000개의 관측치를 갖는 데이터셋을 만드세요.

In [None]:
# 구매 감소 고객에 대한 데이터셋  >> 고객별 데이터를 뽑아야함 >> 구매 감소 요인들
# 여성의 구매가 줄어듬 구매 감소했으면 0 증가했으면 1 요고는 저번에 한거

In [161]:
import pandas as pd
import numpy as np
import random
from datetime import datetime
import time

In [162]:
# 고객별 데이터

random.seed(1)
a = np.random.rand(1000,4)
df1 = pd.DataFrame(a, columns=['cusno','gender','age','region'])

df1.cusno = np.arange(1000,2000)
df1.gender = np.random.choice([0,1], size = 1000)
df1.age = np.random.randint(10,81, 1000)
df1.region = np.random.randint(1,11,1000)

df1.head()

Unnamed: 0,cusno,gender,age,region
0,1000,1,48,2
1,1001,1,52,1
2,1002,0,53,8
3,1003,0,13,6
4,1004,0,53,4


In [163]:
# 거래 데이터
np.random.seed(2)

b = np.random.rand(10000,6)
df2 = pd.DataFrame(b, columns=['id','cusno','product','price','quantity','amount'])


df2.id = random.sample(range(10000,50000),10000)
df2.cusno = np.random.randint(1000,2000,10000)
df2.product = np.random.choice([10,20,30], size=10000)
df2.price = np.random.uniform(1000,50000,10000).round(2)   # << 소수점!!
df2.quantity = np.random.randint(1,100,10000)
df2.amount = df2.price * df2.quantity

df2.head()

Unnamed: 0,id,cusno,product,price,quantity,amount
0,18805,1276,0.549662,4044.19,41,165811.79
1,47303,1549,0.299655,49934.68,38,1897517.84
2,14135,1385,0.18444,9169.17,49,449289.33
3,26716,1393,0.505246,20056.54,82,1644636.28
4,17727,1874,0.226012,38152.0,77,2937704.0


In [164]:
df2['product'] = np.random.choice([10,20,30],10000)
df2.head()

Unnamed: 0,id,cusno,product,price,quantity,amount
0,18805,1276,20,4044.19,41,165811.79
1,47303,1549,30,49934.68,38,1897517.84
2,14135,1385,30,9169.17,49,449289.33
3,26716,1393,20,20056.54,82,1644636.28
4,17727,1874,30,38152.0,77,2937704.0


In [165]:
# 날짜 데이터

date_list = list(pd.date_range('2020-01-01', '2021-12-31', freq='D'))
df2['date'] = np.random.choice(date_list,10000)

df2['day'] = df2['date'].apply(lambda x: x.weekday())  # 젤 중요!
df2['day'] = df2['day'].replace(list(range(7)),['월','화','수','목','금','토','일'])

df2['time'] = np.random.randint(1,25,10000)

df2.head()

Unnamed: 0,id,cusno,product,price,quantity,amount,date,day,time
0,18805,1276,20,4044.19,41,165811.79,2021-09-11,토,12
1,47303,1549,30,49934.68,38,1897517.84,2020-03-19,목,5
2,14135,1385,30,9169.17,49,449289.33,2021-12-06,월,10
3,26716,1393,20,20056.54,82,1644636.28,2021-06-15,화,13
4,17727,1874,30,38152.0,77,2937704.0,2021-11-01,월,19


In [166]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        10000 non-null  int64         
 1   cusno     10000 non-null  int32         
 2   product   10000 non-null  int32         
 3   price     10000 non-null  float64       
 4   quantity  10000 non-null  int32         
 5   amount    10000 non-null  float64       
 6   date      10000 non-null  datetime64[ns]
 7   day       10000 non-null  object        
 8   time      10000 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(4), int64(1), object(1)
memory usage: 547.0+ KB


In [167]:
df2['id'] = df2.apply(lambda x : str(x.date.year-2000)+str(x.id), axis=1)
df2.head()

Unnamed: 0,id,cusno,product,price,quantity,amount,date,day,time
0,2118805,1276,20,4044.19,41,165811.79,2021-09-11,토,12
1,2047303,1549,30,49934.68,38,1897517.84,2020-03-19,목,5
2,2114135,1385,30,9169.17,49,449289.33,2021-12-06,월,10
3,2126716,1393,20,20056.54,82,1644636.28,2021-06-15,화,13
4,2117727,1874,30,38152.0,77,2937704.0,2021-11-01,월,19


In [177]:
# 고객 + 거래 데이터

df = df1.merge(df2, on='cusno',how='outer')
df.head()

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time
0,1000,1,48,2,2026523,20,36659.45,84,3079393.8,2020-11-29,일,15
1,1000,1,48,2,2037901,10,47849.64,6,287097.84,2020-11-01,일,24
2,1000,1,48,2,2029161,20,3952.84,6,23717.04,2020-12-03,목,12
3,1000,1,48,2,2015381,10,5880.61,79,464568.19,2020-05-13,수,8
4,1000,1,48,2,2141980,30,13794.5,35,482807.5,2021-06-09,수,12


In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   cusno     10000 non-null  int32         
 1   gender    10000 non-null  int32         
 2   age       10000 non-null  int32         
 3   region    10000 non-null  int32         
 4   id        10000 non-null  object        
 5   product   10000 non-null  int32         
 6   price     10000 non-null  float64       
 7   quantity  10000 non-null  int32         
 8   amount    10000 non-null  float64       
 9   date      10000 non-null  datetime64[ns]
 10  day       10000 non-null  object        
 11  time      10000 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(7), object(2)
memory usage: 742.2+ KB


In [179]:
# 데이터 전처리 문자화 하기

df.gender.replace({0:'Male',1:'Female'}, inplace=True)
df.head()

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time
0,1000,Female,48,2,2026523,20,36659.45,84,3079393.8,2020-11-29,일,15
1,1000,Female,48,2,2037901,10,47849.64,6,287097.84,2020-11-01,일,24
2,1000,Female,48,2,2029161,20,3952.84,6,23717.04,2020-12-03,목,12
3,1000,Female,48,2,2015381,10,5880.61,79,464568.19,2020-05-13,수,8
4,1000,Female,48,2,2141980,30,13794.5,35,482807.5,2021-06-09,수,12


In [180]:
df.region.replace({1:'서울',
                  2:'부산',
                  3:'대구',
                  4:'울산',
                  5:'대전',
                  6:'속초',
                  7:'경주',
                  8:'전주',
                  9:'광주',
                  10:'제주'}, inplace=True)
df.head()

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time
0,1000,Female,48,부산,2026523,20,36659.45,84,3079393.8,2020-11-29,일,15
1,1000,Female,48,부산,2037901,10,47849.64,6,287097.84,2020-11-01,일,24
2,1000,Female,48,부산,2029161,20,3952.84,6,23717.04,2020-12-03,목,12
3,1000,Female,48,부산,2015381,10,5880.61,79,464568.19,2020-05-13,수,8
4,1000,Female,48,부산,2141980,30,13794.5,35,482807.5,2021-06-09,수,12


In [181]:
# age 연령대별

def agecat(x):
    y=0
    if x<20:
        y='10대'
    elif x<30:
        y='20대'
    elif x<40:
        y='30대'
    elif x<50:
        y='40대'
    elif x<60:
        y='50대'
    else:
        y='60대 이상'
    return y

df['age_group']=df.age.apply(agecat)
df.head()

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time,age_group
0,1000,Female,48,부산,2026523,20,36659.45,84,3079393.8,2020-11-29,일,15,40대
1,1000,Female,48,부산,2037901,10,47849.64,6,287097.84,2020-11-01,일,24,40대
2,1000,Female,48,부산,2029161,20,3952.84,6,23717.04,2020-12-03,목,12,40대
3,1000,Female,48,부산,2015381,10,5880.61,79,464568.19,2020-05-13,수,8,40대
4,1000,Female,48,부산,2141980,30,13794.5,35,482807.5,2021-06-09,수,12,40대


In [182]:
# 가격, 수량, 판매금액 조정


def price_set(x):
    if x==10:
        y= np.random.uniform(200000,300001)
    elif x==20:
        y= np.random.uniform(100000,200001)
    elif x==30:
        y= np.random.uniform(10000,100000)
    return y

def quantity_set(x):
    if x==10:
        z=np.random.randint(1,30)
    elif x==20:
        z=np.random.randint(1,60)
    elif x==30:
        z=np.random.randint(1,100)
    return z

df['price']=df['product'].apply(price_set).round(0)
df['quantity']=df['product'].apply(quantity_set)

df.amount=df.price * df.quantity

df.head()

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time,age_group
0,1000,Female,48,부산,2026523,20,192288.0,24,4614912.0,2020-11-29,일,15,40대
1,1000,Female,48,부산,2037901,10,274050.0,7,1918350.0,2020-11-01,일,24,40대
2,1000,Female,48,부산,2029161,20,128189.0,54,6922206.0,2020-12-03,목,12,40대
3,1000,Female,48,부산,2015381,10,297543.0,11,3272973.0,2020-05-13,수,8,40대
4,1000,Female,48,부산,2141980,30,34084.0,90,3067560.0,2021-06-09,수,12,40대


In [183]:
# 시간 순서로 df를 정렬

df=df.sort_values(['id'])
df.head(10)

Unnamed: 0,cusno,gender,age,region,id,product,price,quantity,amount,date,day,time,age_group
2119,1216,Male,74,속초,2010003,30,95137.0,89,8467193.0,2020-10-27,화,11,60대 이상
9517,1952,Male,33,경주,2010007,10,251012.0,29,7279348.0,2020-12-04,금,21,30대
5084,1511,Male,45,대구,2010028,30,92500.0,58,5365000.0,2020-06-01,월,17,40대
2510,1253,Male,12,속초,2010029,10,289486.0,25,7237150.0,2020-10-10,토,5,10대
1900,1191,Female,25,경주,2010038,10,204261.0,26,5310786.0,2020-07-31,금,10,20대
375,1036,Female,25,대구,2010055,30,47159.0,12,565908.0,2020-04-26,일,2,20대
4726,1477,Male,12,대구,2010057,30,37308.0,47,1753476.0,2020-07-06,월,20,10대
816,1079,Male,62,제주,2010074,20,169043.0,59,9973537.0,2020-08-26,수,5,60대 이상
9458,1946,Female,28,전주,2010091,20,105888.0,8,847104.0,2020-07-21,화,11,20대
7777,1773,Male,56,서울,2010097,10,209010.0,27,5643270.0,2020-04-23,목,22,50대


In [184]:
# id를 인덱스로

df.set_index('id',inplace=True)
df

Unnamed: 0_level_0,cusno,gender,age,region,product,price,quantity,amount,date,day,time,age_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010003,1216,Male,74,속초,30,95137.0,89,8467193.0,2020-10-27,화,11,60대 이상
2010007,1952,Male,33,경주,10,251012.0,29,7279348.0,2020-12-04,금,21,30대
2010028,1511,Male,45,대구,30,92500.0,58,5365000.0,2020-06-01,월,17,40대
2010029,1253,Male,12,속초,10,289486.0,25,7237150.0,2020-10-10,토,5,10대
2010038,1191,Female,25,경주,10,204261.0,26,5310786.0,2020-07-31,금,10,20대
...,...,...,...,...,...,...,...,...,...,...,...,...
2149955,1904,Male,72,부산,10,258679.0,11,2845469.0,2021-11-24,수,16,60대 이상
2149957,1558,Female,29,경주,10,220849.0,25,5521225.0,2021-09-09,목,18,20대
2149964,1374,Male,38,광주,30,93315.0,94,8771610.0,2021-07-18,일,22,30대
2149965,1418,Male,32,대전,20,130822.0,38,4971236.0,2021-01-29,금,23,30대


In [108]:
# 구매 감소 고객에 대한 데이터셋  >> 고객별 데이터를 뽑아야함 >> 구매 감소 요인들
# 여성의 구매가 줄어듬 구매 감소했으면 0 증가했으면 1 요고는 저번에 한거

