### 필요한 라이브러리 불러오기 및 상수 선언

In [1]:
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. about chained
# assignment); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Number of rows to take on each side (before and after) as the reference window

NUM_NEIGHBORS_19 = 30   # for the 2019 data
NUM_NEIGHBORS = 5       # for the data after 2019

### 데이터셋 가져오기

In [3]:
# Directory that holds the per-year thumbnail CSV files
DATA_PATH = '/Users/sea/YBIGTAlab/project0/git0/datasets/thumbnail_csv/'

# Read each year's CSV and order its rows by the 'Published' column
_yearly_frames = [
    pd.read_csv(DATA_PATH + csv_name).sort_values(by='Published')
    for csv_name in ('2019.csv', '2020.csv', '2021.csv', '2022.csv', '2023.csv')
]
data19, data20, data21, data22, data23 = _yearly_frames

In [4]:
# Stack the yearly frames into a single dataset (2019 first, 2023 last)
dataFull = pd.concat((data19, data20, data21, data22, data23))

# Renumber the rows from 0 (discarding the old index) and remove the
# unneeded 'Unnamed: 0' column that came in from the CSV files
dataFull = dataFull.reset_index(drop=True).drop(columns=['Unnamed: 0'])

In [5]:
# Check whether any column has missing values (count of nulls per column)
dataFull.isnull().sum()

Published       0
ThumbnailUrl    0
ViewCount       0
VideoId         0
dtype: int64

In [6]:
# Convert the 'Published' column to datetime, then order all rows by it
dataFull = (
    dataFull
    .assign(Published=pd.to_datetime(dataFull['Published']))
    .sort_values(by='Published')
)

In [7]:
# Split into the 2019 part and the post-2019 part.
# before19 extends NUM_NEIGHBORS_19 rows past the first len(data19) rows;
# after19 starts NUM_NEIGHBORS rows before that point.
split_at = len(data19)
before19 = dataFull.iloc[:split_at + NUM_NEIGHBORS_19].reset_index(drop=True)
after19 = dataFull.iloc[split_at - NUM_NEIGHBORS:].reset_index(drop=True)

### Labeling 함수 선언 및 실행

In [8]:
def getLabels(df, n):
    """Label each row by comparing its view count with a local average.

    For the row at position ``i`` the average is taken over the rows at
    positions ``i - n`` through ``i + n`` (the row itself included). Near
    either end of the frame the window is simply cut short, so it only
    covers rows that exist.

    Args:
        df: DataFrame with a numeric 'ViewCount' column. Rows are treated
            in their current order; the frame itself is not modified.
        n: Number of neighbouring rows to include on each side.

    Returns:
        A copy of ``df`` (same index) with two integer columns added, or
        overwritten if already present:
        'NeighborsMean' - the window mean rounded to the nearest integer;
        'label' - 1 where 'ViewCount' >= the unrounded window mean, else 0.
    """
    # Work on a copy so the caller's frame never gains columns as a side
    # effect, and so no write goes through chained indexing.
    labeled = df.copy()
    views = labeled['ViewCount']

    # A centred window of 2n+1 rows with min_periods=1 shrinks at the edges
    # instead of yielding NaN, which matches slicing [i-n : i+n+1] clamped
    # to the bounds of the frame. It is purely positional, so the result
    # does not depend on the index labels.
    window_mean = views.rolling(
        window=2 * n + 1, center=True, min_periods=1
    ).mean()

    labeled['NeighborsMean'] = window_mean.round().astype('int')
    # Compare against the unrounded mean, as the rounded value is only
    # stored for display.
    labeled['label'] = (views >= window_mean).astype('int')

    return labeled

In [9]:
# Label each part using its own neighbourhood size
before19 = getLabels(before19, NUM_NEIGHBORS_19)
after19 = getLabels(after19, NUM_NEIGHBORS)

### 19년 이전 및 이후 데이터 합치기

In [10]:
# Cut the last NUM_NEIGHBORS_19 rows off before19 and the first
# NUM_NEIGHBORS rows off after19, then join the two parts and renumber rows
head_part = before19.iloc[:len(before19) - NUM_NEIGHBORS_19]
tail_part = after19.iloc[NUM_NEIGHBORS:]
dataLabeled = pd.concat((head_part, tail_part)).reset_index(drop=True)

In [11]:
# Preview the first rows, rendering numbers with a ',' thousands separator
dataLabeled.head().style.format(thousands=',')

Unnamed: 0,Published,ThumbnailUrl,ViewCount,VideoId,NeighborsMean,label
0,2019-01-03 10:00:00+00:00,https://i.ytimg.com/vi/-4BvjRGDIAw/default.jpg,38478,-4BvjRGDIAw,219834,0
1,2019-01-03 10:00:01+00:00,https://i.ytimg.com/vi/HTpd6iJdxyk/default.jpg,69165,HTpd6iJdxyk,217615,0
2,2019-01-04 08:40:26+00:00,https://i.ytimg.com/vi/Tf7lwAJirNY/default.jpg,98047,Tf7lwAJirNY,214952,0
3,2019-01-04 10:00:08+00:00,https://i.ytimg.com/vi/l1LT2hccFfI/default.jpg,85222,l1LT2hccFfI,209538,0
4,2019-01-05 13:04:41+00:00,https://i.ytimg.com/vi/3cpJ6wSC3xI/default.jpg,398331,3cpJ6wSC3xI,219777,1


In [12]:
# Remove the columns that the upcoming modelling stage does not need
unused_cols = ['Published', 'ViewCount', 'NeighborsMean']
dataset = dataLabeled.drop(columns=unused_cols)

dataset

Unnamed: 0,ThumbnailUrl,VideoId,label
0,https://i.ytimg.com/vi/-4BvjRGDIAw/default.jpg,-4BvjRGDIAw,0
1,https://i.ytimg.com/vi/HTpd6iJdxyk/default.jpg,HTpd6iJdxyk,0
2,https://i.ytimg.com/vi/Tf7lwAJirNY/default.jpg,Tf7lwAJirNY,0
3,https://i.ytimg.com/vi/l1LT2hccFfI/default.jpg,l1LT2hccFfI,0
4,https://i.ytimg.com/vi/3cpJ6wSC3xI/default.jpg,3cpJ6wSC3xI,1
...,...,...,...
1853,https://i.ytimg.com/vi/nRBLhMZjazk/default.jpg,nRBLhMZjazk,0
1854,https://i.ytimg.com/vi/JgkdgAfTdBk/default.jpg,JgkdgAfTdBk,1
1855,https://i.ytimg.com/vi/mdsaZz5WVHU/default.jpg,mdsaZz5WVHU,1
1856,https://i.ytimg.com/vi/8hSds3puPQY/default.jpg,8hSds3puPQY,0


### 데이터셋 확인 및 csv 변환

In [13]:
# Check the label balance (number of rows per label value)
dataset['label'].value_counts()

0    1246
1     612
Name: label, dtype: int64

In [14]:
# Write the labelled dataset to CSV, relative to the current working directory.
# NOTE(review): no `index` argument is given, so the row index is written as
# an extra leading column with an empty header; confirm downstream readers
# expect that before changing it.
dataset.to_csv("LabeledData.csv")